Objective: Summary of the demographics and soil characteristics of the Rwanda long term soil health study.

Study methodology

Add in details and links on study methodology here.

library(knitr)
library(ggplot2)
library(stringr)
suppressMessages(library(dplyr))
library(sp)
suppressMessages(library(rgdal))
suppressMessages(library(dismo))
suppressMessages(library(stargazer))
library(leaflet)
library(XML)
suppressMessages(library(maptools))
library(automap)
suppressMessages(library(RStata))
suppressMessages(library(fields))
library(gstat)
library(htmltools)
suppressMessages(library(Matching))
library(reshape2)
options("RStata.StataVersion" = 12)
options("RStata.StataPath" = "/Applications/Stata/StataSE.app/Contents/MacOS/stata-se")
#chooseStataBin("/Applications/Stata/StataSE.app/Contents/MacOS/stata-se")
wd <- "/Users/mlowes/drive/soil health study/data/rw baseline"
dd <- paste(wd, "data", sep = "/")
od <- paste(wd, "output", sep="/")
md <- paste(wd, "maps", sep="/")
drive <- "~/drive/r_help/4_output/statistical_test_outputs"

#load data:
# This data is being drawn from the Soil lab repository. It has the baseline data with it
# d <- read.csv(paste(dd, "Rwanda_shs_commcare_soil_data_final.csv", sep="/"),  stringsAsFactors=FALSE)

Combine survey and soil data

Replicate Alex’s and Emmanuel’s merge process using “Identifiers with SSN_final” provided by Emmanuel. I use the RStata package found here

# I'm replicating Alex's do file here.
stata("merge_shs.do")
## . * merge raw commcare data with soil database
## . 
## . * date: 11 july 2016
## . 
## . clear all
## . set more off
## . 
## . * set directory
## . global wd "/Users/mlowes/drive/soil health study/data/rw baseline"
## . global dd "/Users/mlowes/drive/soil health study/data/rw baseline/data"
## . global troubleshoot "/Users/mlowes/drive/soil health study/data/rw baseline/t
## > roubleshoot"
## . 
## . * insheet data
## . insheet using "$dd/raw_rwanda_commcare_shs.csv", clear
## (94 vars, 2491 obs)
## . 
## . * clean sample_id variable
## . replace sample_id = "" if sample_id == "---"
## (10 real changes made)
## . replace sample_id = "-99" if sample_id == "99"
## (19 real changes made)
## . 
## . destring sample_id, replace
## sample_id has all characters numeric; replaced as int
## (10 missing values generated)
## . 
## . * manipulate current "sample_id" to become a proper alpha-numeric unique iden
## > tifier
## . * this simply involves adding "C" to each numeric code that belongs to a cont
## > rol farmer
## . 
## . * variables of interest:
## .       * d_client
## .       * sample_id
## . 
## . ren demographic_infod_client d_client
## . replace d_client = "" if d_client == "---"
## (10 real changes made)
## . destring d_client, replace
## d_client has all characters numeric; replaced as byte
## (10 missing values generated)
## . 
## . tostring sample_id, gen(sample_id_string)
## sample_id_string generated as str4
## . 
## . replace sample_id_string = sample_id_string + "C" if d_client == 0
## sample_id_string was str4 now str5
## (1236 real changes made)
## . drop sample_id 
## . ren sample_id_string sample_id
## . 
## . ***** !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ***
## > ***
## . * quantify and note cases where sample_id appears more than twice
## .      * 39 codes appear 2x
## .      * 1 code appears 3x
## .      * 1 code appears 4x
## .      * 1 code appears 10x
## .      * 1 code appears 19x
## . 
## . duplicates report sample_id
## 
## Duplicates in terms of sample_id
## 
## --------------------------------------
##    copies | observations       surplus
## ----------+---------------------------
##         1 |         2377             0
##         2 |           78            39
##         3 |            3             2
##         4 |            4             3
##        10 |           10             9
##        19 |           19            18
## --------------------------------------
## . duplicates tag sample_id, gen(n_duplicate_id)
## 
## Duplicates in terms of sample_id
## . gen d_id_problem = 1 if n_duplicate_id != 0
## (2377 missing values generated)
## . replace d_id_problem = 0 if missing(d_id_problem)
## (2377 real changes made)
## . 
## . ** need to investigate 5% of observations (114 samples), i.e. d_id_problem = 
## > 1**
## . ***** !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ***
## > ***
## . 
## . * outsheet problematic observations and examine 1-by-1
## . sort sample_id
## . outsheet demographicnom_enqueteur district demographic_infocellule_selected d
## > emographic_infoumudugudu d_client demographic_infonom_cultivateur sample_id d
## > emographic_infosex demographic_infoage_cultivateur demographic_infon_telephon
## > e_cult d_id_problem using "$dd/shs_to_check.csv" if d_id_problem == 1, replac
## > e comma
## . 
## . **************** post-investigation correction of incorrect ids *************
## > ***
## . 
## . ren demographic_infonom_cultivateur farmer_name
## . ren demographic_infocellule_selected selected_cell
## . ren demographicnom_enqueteur enumerator_name
## . 
## . * drop test/accidentally double-entered observations
## . drop if infoformid == "f138897f-6e80-4b0a-9db3-ab2134cd9e51"
## (1 observation deleted)
## . drop if farmer_name == "Mugisha"
## (1 observation deleted)
## . drop if farmer_name == "Muhunde yoramu"
## (1 observation deleted)
## . drop if farmer_name == "Test"
## (1 observation deleted)
## . drop if farmer_name == "Billy"
## (1 observation deleted)
## . drop if farmer_name == "Gakuba michel"
## (2 observations deleted)
## . drop if farmer_name == "Renata"
## (1 observation deleted)
## . drop if farmer_name == "Bub"
## (1 observation deleted)
## . drop if farmer_name == "M"
## (1 observation deleted)
## . drop if farmer_name == "Anita"
## (1 observation deleted)
## . drop if farmer_name == "Jean"
## (1 observation deleted)
## . drop if farmer_name == "Nyirazaninka lea"
## (1 observation deleted)
## . drop if farmer_name == "Mugiraneza Pacifiue"
## (1 observation deleted)
## . drop if farmer_name == "Nsengiyaremye Innocent"
## (1 observation deleted)
## . 
## . * correct incorrectly recorded ids
## . replace sample_id = "880C" if farmer_name == "Nyiransanzineza Marie Rose"
## (1 real change made)
## . replace sample_id = "811C" if farmer_name == "Ngirabatware  Daniel"
## (1 real change made)
## . replace sample_id = "646C" if farmer_name == "Nyirasabwa anonciata"
## (1 real change made)
## . replace sample_id = "624C" if farmer_name == "Nyirinkindi j Bosco"
## (1 real change made)
## . replace sample_id = "560C" if farmer_name == "Mubirigi Bertin"
## (1 real change made)
## . replace d_client = 0 if farmer_name == "Mubirigi Bertin"
## (1 real change made)
## . replace sample_id = "375C" if farmer_name == "Hitimana ezecquiere"
## (1 real change made)
## . replace sample_id = "322C" if farmer_name == "Nyirantabire arrivera"
## (1 real change made)
## . replace sample_id = "2566C" if farmer_name == "MUSABYIMANA Colletta"
## (1 real change made)
## . replace sample_id = "2399C" if farmer_name == "Nyirabajyambere anastisia"
## (1 real change made)
## . replace d_client = 0 if farmer_name == "Nyirabajyambere anastisia"
## (1 real change made)
## . replace sample_id = "2280C" if farmer_name == "Zihabake Simon"
## (1 real change made)
## . replace sample_id = "2202C" if farmer_name == "mugiraneza pacifique"
## (1 real change made)
## . replace sample_id = "1780C" if farmer_name == "Hakuzimana Theophile"
## (1 real change made)
## . replace sample_id = "1757C" if farmer_name == "Bavugirije Feleciane"
## (1 real change made)
## . replace sample_id = "1626C" if farmer_name == "Nyabivumu"
## (1 real change made)
## . replace sample_id = "1575C" if farmer_name == "BAHIMBANDE Beriyana."
## (1 real change made)
## . replace d_client = 0 if farmer_name == "BAHIMBANDE Beriyana."
## (1 real change made)
## . replace sample_id = "1103C" if farmer_name == "Barirwanda Elyse"
## (1 real change made)
## . replace sample_id = "1037C" if farmer_name == "Mukashema Faraziya"
## (1 real change made)
## . replace d_client = 0 if farmer_name == "Mukashema Faraziya" 
## (1 real change made)
## . replace sample_id = "3069" if farmer_name == "Ndababonye Silas"
## (1 real change made)
## . replace sample_id = "2611" if farmer_name == "Sebazungu modeste"
## (1 real change made)
## . replace d_client = 1 if farmer_name == "Sebazungu modeste"
## (1 real change made)
## . replace sample_id = "2415" if farmer_name == "Nkaka joseph"
## (1 real change made)
## . replace sample_id = "2412" if farmer_name == "Ndagijimana alexis"
## (1 real change made)
## . replace sample_id = "2410" if farmer_name == "Sebuhoro elie"
## (1 real change made)
## . replace sample_id = "2406" if farmer_name == "Mugisha ruth"
## (1 real change made)
## . replace sample_id = "2405" if farmer_name == "Mboneza hasan"
## (1 real change made)
## . replace sample_id = "2404" if farmer_name == "Ntibitonderwa veriane"
## (1 real change made)
## . replace sample_id = "2399" if farmer_name == "Nyiraburakeye feresita"
## (1 real change made)
## . replace sample_id = "2395" if farmer_name == "Uwimana danier"
## (1 real change made)
## . replace sample_id = "2394" if farmer_name == "Bizinde silas"
## (1 real change made)
## . replace sample_id = "2390" if farmer_name == "Nyirahabimvana keresesiya"
## (1 real change made)
## . replace sample_id = "2388" if farmer_name == "Nyirasekerabanzi tharcilla"
## (1 real change made)
## . replace sample_id = "2386" if farmer_name == "Ndererimana emmanuel"
## (1 real change made)
## . replace sample_id = "2291" if farmer_name == "Ndacyayisenga Feresiyani"
## (1 real change made)
## . replace sample_id = "2184" if farmer_name == "Muhawenimana Beretirida"
## (1 real change made)
## . replace sample_id = "2145" if farmer_name == "Fapfakwita celestine"
## (1 real change made)
## . replace sample_id = "2141" if farmer_name == "Nsangiranabo krizoroji"
## (1 real change made)
## . replace sample_id = "2140" if farmer_name == "Nsabyumuremyi anakereti"
## (1 real change made)
## . replace sample_id = "2136" if farmer_name == "Nyirantibitonderwa savoroniya"
## (1 real change made)
## . replace sample_id = "2131" if farmer_name == "Hagenimana Ewufroni"
## (1 real change made)
## . replace sample_id = "2066" if farmer_name == "Hitayezu vicent"
## (1 real change made)
## . replace d_client = 1 if farmer_name == "Hitayezu vicent"
## (1 real change made)
## . replace sample_id = "2044" if farmer_name == "Niyodushima janette"
## (1 real change made)
## . replace d_client = 1 if farmer_name == "Niyodushima janette"
## (1 real change made)
## . replace sample_id = "2024" if farmer_name == "Mukabatsinda veren A"
## (1 real change made)
## . replace sample_id = "2020" if farmer_name == "Sibomana innocent"
## (1 real change made)
## . replace sample_id = "2009" if farmer_name == "Nsabimana inyasi"
## (1 real change made)
## . replace sample_id = "2002" if farmer_name == "Mateso elizabet"
## (1 real change made)
## . replace sample_id = "1889" if farmer_name == "Niyonsaba Daniel"
## (1 real change made)
## . replace sample_id = "1864" if farmer_name == "Mukankubana Souzanne"
## (1 real change made)
## . replace sample_id = "1780" if farmer_name == "Nyirantihabose Annonciatha"
## (1 real change made)
## . replace sample_id = "1771" if farmer_name == "Bizimungu Damascne"
## (1 real change made)
## . replace sample_id = "1675" if farmer_name == "Bikorimana Isaac"
## (1 real change made)
## . replace sample_id = "1103" if farmer_name == "Munyabarata japhette"
## (1 real change made)
## . replace sample_id = "954" if farmer_name == "Hahirwabake silver"
## (1 real change made)
## . replace sample_id = "541" if farmer_name == "Rugemintwaza Vianney"
## (1 real change made)
## . replace sample_id = "454" if farmer_name == "Niyonizeye fororida"
## (1 real change made)
## . replace sample_id = "407" if farmer_name == "Munyaneza jean pierre"
## (1 real change made)
## . replace sample_id = "375" if farmer_name == "Munyarubuga nanias"
## (1 real change made)
## . replace sample_id = "322" if farmer_name == "Gacandaga zackarie"
## (1 real change made)
## . replace sample_id = "262" if farmer_name == "MUHIRWA J Damascene"
## (1 real change made)
## . replace sample_id = "32" if farmer_name == "Nsanzemuhire"
## (1 real change made)
## . 
## . *** re-run duplicates test
## . drop d_id_problem
## . drop n_duplicate_id
## . 
## . duplicates report sample_id
## 
## Duplicates in terms of sample_id
## 
## --------------------------------------
##    copies | observations       surplus
## ----------+---------------------------
##         1 |         2464             0
##         2 |           12             6
## --------------------------------------
## . duplicates tag sample_id, gen(n_duplicate_id)
## 
## Duplicates in terms of sample_id
## . gen d_id_problem = 1 if n_duplicate_id != 0
## (2464 missing values generated)
## . replace d_id_problem = 0 if missing(d_id_problem)
## (2464 real changes made)
## . 
## . * second try: outsheet problematic observations and examine 1-by-1
## . sort sample_id
## . outsheet using "$dd/shs_to_check_2.csv" if d_id_problem == 1, replace comma
## . 
## . * second try: correct incorrect ids
## . 
## . replace sample_id = "2202" if farmer_name == "Mukamurara scholastique"
## (1 real change made)
## . replace d_client = 1 if farmer_name == "Mukamurara scholastique"
## (1 real change made)
## . replace sample_id = "2195" if farmer_name == "Ndacyayisenga Feresiyani"
## (1 real change made)
## . replace sample_id = "824" if farmer_name == "Ngirabatware  Daniel"
## (1 real change made)
## . 
## . replace sample_id = "" if rownumber == 238
## (1 real change made)
## . replace sample_id = "" if rownumber == 848
## (1 real change made)
## . replace sample_id = "" if rownumber == 1257
## (1 real change made)
## . replace sample_id = "" if rownumber == 1732
## (1 real change made)
## . 
## . *** re-run duplicates test
## . drop d_id_problem
## . drop n_duplicate_id
## . 
## . duplicates report sample_id
## 
## Duplicates in terms of sample_id
## 
## --------------------------------------
##    copies | observations       surplus
## ----------+---------------------------
##         1 |         2472             0
##         4 |            4             3
## --------------------------------------
## . duplicates tag sample_id, gen(n_duplicate_id)
## 
## Duplicates in terms of sample_id
## . gen d_id_problem = 1 if n_duplicate_id != 0
## (2472 missing values generated)
## . replace d_id_problem = 0 if missing(d_id_problem)
## (2472 real changes made)
## . 
## . * third try: outsheet problematic observations
## . sort sample_id
## . outsheet using "$dd/shs_to_check_3.csv" if d_id_problem == 1, replace comma
## . 
## . *** 4 sets of duplicates remain (2 observations)
## . drop if d_id_problem == 1
## (4 observations deleted)
## . save "$dd/rwanda_commcare_shs_clean.dta", replace
## file /Users/mlowes/drive/soil health study/data/rw baseline/data/rwanda_commcar
## > e_shs_clean.dta saved
## . outsheet using "$dd/rwanda_commcare_shs_clean.csv", comma replace
## . 
## . * merge datasets
## . insheet using "$dd/mne_rwanda_database_shs.csv", clear
## (28 vars, 2483 obs)
## . 
## . save "$dd/mne_rwanda_database_shs.dta", replace
## file /Users/mlowes/drive/soil health study/data/rw baseline/data/mne_rwanda_dat
## > abase_shs.dta saved
## . 
## . use "$dd/rwanda_commcare_shs_clean.dta", clear
## . 
## . merge 1:1 sample_id using "$dd/mne_rwanda_database_shs.dta"
## 
##     Result                           # of obs.
##     -----------------------------------------
##     not matched                            93
##         from master                        41  (_merge==1)
##         from using                         52  (_merge==2)
## 
##     matched                             2,431  (_merge==3)
##     -----------------------------------------
## . 
## .    * 93 observations unmatched... :(
## . 
## . * investigate unmatched observations
## . sort sample_id
## . outsheet using "$troubleshoot/unmatched_commcare.csv" if _merge == 1, comma r
## > eplace
## . outsheet using "$troubleshoot/unmatched_inventory.csv" if _merge == 2, comma 
## > replace
## . 
## . * make first round of corrections in master data
## . use "$dd/rwanda_commcare_shs_clean.dta", clear
## . 
## . replace sample_id = "2711C" if farmer_name == "Kamuhanda Elie"
## (1 real change made)
## . replace sample_id = "260C" if farmer_name == "HATEGEKIMANA Mathias"
## (1 real change made)
## . replace sample_id = "245" if farmer_name == "Kayiranga Michel"
## (1 real change made)
## . replace sample_id = "1543" if farmer_name == "Ndagijimana Eliezel"
## (1 real change made)
## . replace sample_id = "1543C" if farmer_name == "Mukantanganzwa odeta"
## (1 real change made)
## . replace sample_id = "1632" if farmer_name == "Nyiranziguye Cecile"
## (1 real change made)
## . replace sample_id = "2782" if farmer_name == "Mukamungu leocadie"
## (1 real change made)
## . replace sample_id = "421C" if farmer_name == "Niyonizeye Francine"
## (1 real change made)
## . replace sample_id = "828C" if farmer_name == "Mukamazimpaka Terese"
## (1 real change made)
## . replace sample_id = "364" if farmer_name == "Mukamuyango verina"
## (1 real change made)
## . replace sample_id = "364C" if farmer_name == "Mujawimana jeanne"
## (1 real change made)
## . replace sample_id = "368" if farmer_name == "Nyiramanzi jean damascene"
## (1 real change made)
## . replace sample_id = "368C" if farmer_name == "Karwiyegura rachel"
## (1 real change made)
## . replace sample_id = "391" if farmer_name == "Mugemane augistin"
## (1 real change made)
## . replace sample_id = "391C" if farmer_name == "Ngirishuti sumayire"
## (1 real change made)
## . replace sample_id = "395" if farmer_name == "Niyomugabo amiel"
## (1 real change made)
## . replace sample_id = "395C" if farmer_name == "Ntawiniga augustin"
## (1 real change made)
## . replace sample_id = "396" if farmer_name == "Mpayimana philippe"
## (1 real change made)
## . replace sample_id = "329" if farmer_name == "Mucyezangango emmanuel"
## (1 real change made)
## . replace sample_id = "411" if farmer_name == "Nsabimana mathias"
## (1 real change made)
## . replace sample_id = "411C" if farmer_name == "Singirankabo"
## (1 real change made)
## . replace sample_id = "414" if farmer_name == "Rutaharama eldephonse"
## (1 real change made)
## . replace sample_id = "329C" if farmer_name == "Mukamana josephine"
## (1 real change made)
## . replace sample_id = "657" if farmer_name == "Nyandwi vincent"
## (1 real change made)
## . replace sample_id = "569C" if farmer_name == "Uwimana Bercilla"
## (1 real change made)
## . replace sample_id = "2051C" if farmer_name == "Ntambabazi Anastase"
## (1 real change made)
## . replace sample_id = "2249" if farmer_name == "Uwineza valentine"
## (1 real change made)
## . replace sample_id = "2360C" if farmer_name == "Bamporineza j deDieu"
## (1 real change made)
## . replace sample_id = "2001" if farmer_name == "Nyiramakuba thanene"
## (1 real change made)
## . replace sample_id = "2897C" if farmer_name == "Mukagatanazi Marianne"
## (1 real change made)
## . replace sample_id = "2965C" if farmer_name == "Twizeyimana Theoneste"
## (1 real change made)
## . 
## . drop if farmer_name == "Yjk"
## (1 observation deleted)
## . drop if farmer_name == "Uwambajimana Agnes"
## (1 observation deleted)
## . drop if farmer_name == "Tr"
## (1 observation deleted)
## . 
## . replace sample_id = "2415C" if farmer_name == "Nyiramahirwe Frolance"
## (1 real change made)
## . replace sample_id = "851" if farmer_name == "Bigirimana  Amon"
## (1 real change made)
## . 
## . 
## . * duplicates check
## . drop d_id_problem
## . drop n_duplicate_id
## . 
## . duplicates report sample_id
## 
## Duplicates in terms of sample_id
## 
## --------------------------------------
##    copies | observations       surplus
## ----------+---------------------------
##         1 |         2469             0
## --------------------------------------
## . duplicates tag sample_id, gen(n_duplicate_id)
## 
## Duplicates in terms of sample_id
## . gen d_id_problem = 1 if n_duplicate_id != 0
## (2469 missing values generated)
## . replace d_id_problem = 0 if missing(d_id_problem)
## (2469 real changes made)
## . 
## . * second try: merge
## . merge 1:1 sample_id using "$dd/mne_rwanda_database_shs.dta"
## 
##     Result                           # of obs.
##     -----------------------------------------
##     not matched                            24
##         from master                         5  (_merge==1)
##         from using                         19  (_merge==2)
## 
##     matched                             2,464  (_merge==3)
##     -----------------------------------------
## . 
## . * deal with 30 unmatched cases
## . sort sample_id
## . outsheet using "$troubleshoot/unmatched_commcare_2.csv" if _merge == 1, comma
## >  replace
## . outsheet using "$troubleshoot/unmatched_inventory_2.csv" if _merge == 2, comm
## > a replace
## . 
## . * 5 submitted surveys with no corresponding sample
## . *drop if _merge == 1
## . 
## . * 19 samples with no corresponding survey
## . *drop if _merge == 2
## . 
## . * outsheet merged database
## . outsheet using "$dd/rwanda_shs_baseline_data.csv", comma replace
## . * THIS IS THE CLEAN VERSION! ONLY USE THIS! DON'T USE OTHER THINGS!
## .

Import the results of Alex’s do file.

d <- read.csv(paste(dd, "rwanda_shs_baseline_data.csv", sep = "/"), stringsAsFactors = F)
Identifiers <- read.csv(paste(dd, "Identifiers with SSN_final.csv", sep = "/"), stringsAsFactors = F)

wetChem <- read.csv(paste("/Users/mlowes/drive/JuPyteR/robert.on@oneacrefund.org/Rwanda Acidity", "Original chem_rwanda_shs.csv", sep = "/"))


d <- left_join(d, Identifiers, by="sample_id")

# now import the soil results and merge with survey data
soilRF <- read.csv(paste(dd, "rwshs_rfresults.csv", sep = "/"), stringsAsFactors = F)
names(soilRF)[names(soilRF)=="X"] <- "SSN"

d <- left_join(d, soilRF, by="SSN")

Cleaning baseline variables

Now let’s start cleaning the demographic variables

# take out weird CommCare stuff
d[d=="---"] <- NA

names(d) <- gsub("text_final_questions", "", names(d))
names(d) <- gsub("intro_champ_echantillon", "", names(d))
names(d) <- gsub("demographic_info", "", names(d))
names(d) <- gsub("other_inputs_", "", names(d))
names(d) <- gsub("crop1_15b_inputs", "", names(d))
names(d) <- gsub("crop2_15b_inputs", "", names(d))
names(d) <- gsub("^15b", "", names(d))
names(d) <- gsub("historical_intro", "", names(d))

names(d)[names(d)=="field_dim"] <- "field_dim1"
names(d)[names(d)=="v51"] <- "field_dim2"

Address unusual field sizes

ggplot(d, aes(x=field_dim1, y=field_dim2)) + geom_point()
## Warning: Removed 19 rows containing missing values (geom_point).

It seems really unlikely that fields are 200 meters long while only being 20 meters wide. I don’t know how to check this though.

# clean field dimensions here - winsor the values to something reasonable.
d[(d$field_dim1>=100 | d$field_dim2>=100) & !is.na(d$field_dim1), c("field_dim1", "field_dim2")]
##      field_dim1 field_dim2
## 125          85        125
## 126         200         10
## 127          20        152
## 130         140        250
## 140          50        135
## 211          12        107
## 415           5        135
## 452         108         65
## 626         100         40
## 836          12        153
## 844          10        212
## 1153        150         20
## 1154        150        160
## 1155        100         40
## 1239        102         57
## 1241        112         13
## 1314        140          8
## 1345        104         24
## 1347        123          6
## 1378        120          9
## 1388        100          7
## 1389        200          4
## 1397        160          4
## 1399        150         50
## 1420        200         20
## 1421        200         18
## 1440         30        105
## 1457         29        101
## 1476        102         73
## 1477        108         62
## 1525         12        123
## 1539        100         95
## 1567        100          8
## 1709        100          6
## 1859        157          9
## 1976         15        120
## 1979         25        102
## 2074          7        100
## 2411         30        153
## 2452        203          8

Take care of demographic data formatting issues

# deal with names and drop unnecessary variables
d <- d %>% 
    dplyr::select(-c(rownumber, infoformid, introductiond_accept, photo,
        infocompleted_time, 
        enumerator_name, contains("phone"), farmer_name, farmersurname, farmername,
        d_respondent, additionalsamplepackedandsenttol, additionalsamplerequestedfromlab,
        datedryingcompleteifnecessary, driedindistrictifnecessary, senttohqyo,
        collectedindistrictyo, excessstoredathq_, receivedathq_,dateofinitialdryingifnecessary,
        samplecollectedinfieldyo, field_des, samplewetordry)) %>%
    rename(
    female = sex,
    age = age_cultivateur,
    own = d_own,
    client = d_client) %>%
    mutate(
    female= ifelse(female=="gore", 1,0),
    field.size = field_dim1*field_dim2
    )

d$total.seasons <- apply(d[, grep("d_season_list", names(d))], 1, function(x) {
    sum(x, na.rm=T)})

Clean agronomic practice variables

agVars <- c("n_season_fert", "n_season_compost", "n_season_lime", "n_season_fallow",
            "n_seasons_leg_1", "n_seasons_leg_2")
summary(d[,agVars])
##  n_season_fert    n_season_compost n_season_lime     n_season_fallow  
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.0000   Min.   : 0.0000  
##  1st Qu.: 0.000   1st Qu.: 2.000   1st Qu.: 0.0000   1st Qu.: 0.0000  
##  Median : 1.000   Median : 5.000   Median : 0.0000   Median : 0.0000  
##  Mean   : 2.041   Mean   : 5.651   Mean   : 0.1819   Mean   : 0.6355  
##  3rd Qu.: 3.000   3rd Qu.:10.000   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.0000   Max.   :10.0000  
##  NA's   :19       NA's   :19       NA's   :19        NA's   :19       
##  n_seasons_leg_1  n_seasons_leg_2   
##  Min.   : 0.000   Min.   : -88.000  
##  1st Qu.: 0.000   1st Qu.:   0.000  
##  Median : 1.000   Median :   2.000  
##  Mean   : 2.171   Mean   :   4.368  
##  3rd Qu.: 4.000   3rd Qu.:   5.000  
##  Max.   :10.000   Max.   :3333.000  
##  NA's   :19       NA's   :19

Sort out the legumes as a second crop

table(d$n_seasons_leg_2, useNA = 'ifany')
## 
##  -88    0    1    2    3    4    5    6    7    8    9   10   15   22   50 
##    1  956  221  258  174  130  301   61   34   66   57  199    1    1    1 
##   53   83   88  101  102 3333 <NA> 
##    3    1    1    1    1    1   19
d$n_seasons_leg_2 <- ifelse(d$n_seasons_leg_2 <0 | d$n_seasons_leg_2>10, NA, d$n_seasons_leg_2)

Check out client variable - why are there missing values?

table(d$client, useNA = 'ifany')
## 
##    0    1 <NA> 
## 1233 1236   19
d[is.na(d$client), c("sample_id")]
##  [1] "1189C" "1207C" "1295"  "1295C" "1298C" "1300"  "1300C" "1366C"
##  [9] "1476C" "1485C" "1554"  "1573"  "1614"  "2042"  "2371C" "2373" 
## [17] "2375"  "2382"  "2442"
# replace client based on whether there is a C in the client variable.
d$client.check <- ifelse(grepl("C", d$sample_id)==T, 0, 1)
table(d$client, d$client.check)
##    
##        0    1
##   0 1233    0
##   1    1 1235

It looks like most farmers were recorded correctly except for one farmer who was coded as Tubura farmer but their sample_id indicate their a control. Let’s take a look:

d[d$client==1 & d$client.check==0 & !is.na(d$d_gps), c("sample_id", "tuburacontroltc")]
##     sample_id tuburacontroltc
## 990     2202C         Control
# they should be a control
d[d$client==1 & d$client.check==0 & !is.na(d$d_gps), "client"] <- 0

# remove farmers for which we have soil data but no survey data (using)
d <- d[-which(grepl("using", d$X_merge)),]
table(d$client, d$client.check, useNA = 'ifany')
##    
##        0    1
##   0 1234    0
##   1    0 1235
# update client to equal client check
d$client <- d$client.check

Fix some more variable names:

names(d)[names(d)=="field_kg_fert1_1"] <- "field_kg_fert1_15b"
names(d)[names(d)=="field_kg_fert2_1"] <- "field_kg_fert2_15b"
names(d)[names(d)=="field_kg_compost"] <- "field_kg_compost_15b"

Recode variables to numeric:

# recode to numeric
varlist <- c("client", "own", "crop1_15b_seedkg", "crop1_15b_yield", "crop1_15b_yield_",
    "crop2_15b_seedkg", "crop2_15b_yield", "crop2_15b_yield_", "field_kg_fert1_15b",
    "field_kg_fert2_15b", "field_kg_compost_15b", "d_lime_15b", "kg_lime_15b")

# check that there aren't values hidden in the character variables
#apply(d[,varlist], 2, function(x){table(x, useNA='ifany')})

# recode characters to numerics
d[, varlist] <- sapply(d[,varlist], as.numeric)

Sort out kgs of lime applied

table(d$kg_lime_15b, useNA = 'ifany')
## 
##  -88    1    3    4    5   10   13   15   20   25   30   35   50   60   88 
##    1    2    2    2    6    5    1    1    2    5    1    1    8    1    2 
##  100  150  200 <NA> 
##    2    4    1 2422
d$kg_lime_15b <- ifelse(abs(d$kg_lime_15b)==88, NA, d$kg_lime_15b)
# divide out GPS coordinates
# http://rfunction.com/archives/1499

# replace the blank gps_pic_guide with info
d <- cbind(d, str_split_fixed(d$gps_pic_guid, " ", n=4))
names(d)[107:110] <- c("lat", "lon", "alt", "precision")
d[,c("lat", "lon", "alt", "precision")] <- sapply(d[,c("lat", "lon", "alt", "precision")],
                                                  function(x){as.numeric(as.character(x))})

Cleaning soil data

Cleaning of soil data: Come back, check and clean the soil data before outputting to clean data set. Plot each of the soil variables to look for unrealistic values.

dim(d[is.na(d$m3.Ca),])
## [1]  26 110
d <- d[-which(is.na(d$m3.Ca)),]
       
summary(d[,c("m3.Ca", "m3.Mg", "pH", "Total.N", "Total.C")])
##      m3.Ca            m3.Mg               pH           Total.N       
##  Min.   : 220.5   Min.   :  38.26   Min.   :4.547   Min.   :0.05523  
##  1st Qu.: 443.0   1st Qu.: 114.16   1st Qu.:5.021   1st Qu.:0.13596  
##  Median : 720.2   Median : 184.98   Median :5.539   Median :0.15552  
##  Mean   : 876.4   Mean   : 207.82   Mean   :5.544   Mean   :0.15717  
##  3rd Qu.:1157.9   3rd Qu.: 270.26   3rd Qu.:6.009   3rd Qu.:0.17886  
##  Max.   :3669.2   Max.   :1008.93   Max.   :7.211   Max.   :0.24701  
##     Total.C      
##  Min.   :0.8988  
##  1st Qu.:1.7227  
##  Median :2.1118  
##  Mean   :2.1096  
##  3rd Qu.:2.3657  
##  Max.   :4.1620

Let’s check out the rows for which we don’t have soil data and drop them as they won’t contribute to the full picture.

Graphs of RW baseline soil variables

soilVars <- c("m3.Ca", "m3.Mg", "pH", "Total.N", "Total.C")
for(i in 1:length(soilVars)){
print(
  ggplot(data=d, aes(x=as.factor(client), y=d[,soilVars[i]])) + 
    geom_boxplot() +
    labs(x="Tubura Farmer", y=soilVars[i], title = paste("RW baseline soil - ", soilVars[i], sep = ""))
  )  
}

Check out soil relationships

There are biologically predictable relationships between soil chemical characteristics. For instance, we expect Ca and Mg to move in the same direction and be positively correlated with pH. If we had Aluminum as an outcome, we’d expect pH to be negatively correlated with soluable aluminum. Let’s look quickly to confirm if those relationships are present:

ggplot(d, aes(x=m3.Ca, y=m3.Mg)) + geom_point() +
    stat_smooth(method="loess") +
    labs(x = "Calcium (m3)", y= "Magnesium (m3)", title="Calcium and Magnesium relationship")

ggplot(d, aes(x=pH, y=m3.Ca)) + geom_point() +
  stat_smooth(method="loess") +
  labs(x = "pH", y="Calcium (m3)", title = "pH and Calcium relationship")

ggplot(d, aes(x=pH, y=m3.Mg)) + geom_point() +
  stat_smooth(method="loess") +
  labs(x = "pH", y="Magnesium (m3)", title = "pH and Magnesium relationship")

ggplot(d, aes(x=Total.C, y=Total.N)) + geom_point() + 
  stat_smooth(method="loess") +
  labs(x = "Total Carbon", y="Total Nitrogen", title = "Carbon and Nitrogen relationship")

The soil characteristics are moving in the manner that is consistent with our understanding of soil chemical processes.

Save clean demographic and soil data to external file

write.csv(d, file=paste(dd, "shs rw baseline.csv", sep = "/"))
save(d, file=paste(dd, "shs rw baseline.Rdata", sep = "/"))

Map of baseline observations

Produce a simple map of where our observations are

See here for more on using markerClusterOptions in leaflet.

In the map below, the larger green circles are Tubura farmers and the smaller blue circles are control farmers.

e <- d[!is.na(d$lon),]
ss <- SpatialPointsDataFrame(coords = e[, c("lon", "lat")], data=e)

pal <- colorNumeric(c("navy", "green"), domain=unique(ss$client))
map <- leaflet() %>% addTiles() %>%
  setView(lng=rwanda$longitude, lat=rwanda$latitude, zoom=8) %>%
  addCircleMarkers(lng=ss$lon, lat=ss$lat, 
                   radius= ifelse(ss$client==1, 10,6),
                   color = pal(ss$client),
clusterOptions = markerClusterOptions(disableClusteringAtZoom=13, spiderfyOnMaxZoom=FALSE))

map

Summary statistics

Table of final baseline breakdown

count <- d %>% group_by(district) %>% 
  dplyr::summarize(
    t.count = sum(ifelse(client==1,1,0)),
    c.count = sum(ifelse(client==0,1,0)),
    total = n()
  ) %>% ungroup()

count <- as.data.frame(count)
write.csv(count, file=paste(od, "final rw sample breakdown.csv", sep="/"), row.names=F)
as.data.frame(count)
##        district t.count c.count total
## 1   Gatsibo_LWH      58      59   117
## 2  Gatsibo_NLWH      87      81   168
## 3      Gisagara      46      46    92
## 4          Huye     114     115   229
## 5       Karongi     163     159   322
## 6       Kayonza      55      58   113
## 7      Mugonero     131     129   260
## 8     Nyamagabe      56      56   112
## 9    Nyamasheke     147     146   293
## 10       Nyanza      46      46    92
## 11    Nyaruguru      46      46    92
## 12      Rutsiro     165     167   332
## 13    Rwamagana     110     111   221

Baseline balance

Let’s see how balanced our farmers are in terms of demographic variables. Tubura farmers were selected based on (list criteria) and control farmers in the same area tha fit the same criteria were also selected. No matching process has been performed to identify the control farmers that most closely resemble the Tubura farmers in the sample. These results are entirely reflecting the balance inherent in the identification process, not any statistical matching of treatment and control.

out.list <- c("female", "age", "hhsize", "own", "field.size",
              "n_season_fert", "n_season_compost", "n_season_lime", "n_season_fallow", "n_seasons_leg_1", "n_seasons_leg_2", "m3.Ca", "m3.Mg", "pH", "Total.C", "Total.N")

output <- do.call(rbind, lapply(out.list, function(x) {
  
  out <- t.test(d[,x] ~ d[,"client"], data=d)
  tab <- data.frame(out[[5]][[1]], out[[5]][[2]], out[3])
  tab[,1:2] <- round(tab[,1:2],3)
  names(tab) <- c(names(out[[5]]), "pvalue")
  return(tab)
}))

# use p.adjust with bonferroni correction
output$pvalue <- p.adjust(output$pvalue, method="fdr")
rownames(output) <- out.list
output <- output[order(output$pvalue),]
output$pvalue <- ifelse(output[, 3] < 0.001, "< 0.001", round(output[, 3], 3)) 


colnames(output) <- c("Non-Tubura", "Tubura Client", "p-value") 
print(kable(output))
Non-Tubura Tubura Client p-value
n_season_fert 0.833 3.263 < 0.001
female 0.649 0.493 < 0.001
age 48.088 44.021 < 0.001
hhsize 4.894 5.418 < 0.001
n_seasons_leg_2 3.072 2.550 < 0.001
n_season_lime 0.132 0.228 < 0.001
own 0.957 0.932 0.02
n_season_compost 5.523 5.818 0.111
n_season_fallow 0.569 0.690 0.116
Total.N 0.158 0.156 0.116
m3.Ca 893.791 858.994 0.156
m3.Mg 211.819 203.847 0.156
field.size 601.671 667.734 0.19
Total.C 2.121 2.098 0.304
pH 5.555 5.533 0.352
n_seasons_leg_1 2.135 2.228 0.397
#write table
write.csv(output, file=paste(od, "baseline balance.csv", sep="/"), row.names=T)

Overall balance interpretation

Demographic variables We are not well balanced along the main demographic variables we collected, sex, age and HH size. For the purposes of inference we can test some matching algorithms to improve the match between Tubura and control farmers.

Agricultural practice variables We are decently balanced along agricultural practice variables. Our course of action here is similiar to our options with the demographic variables.

Soil Variables We are balanced on the primary soil variables of interest betwen our Tubura farmers and the comparison farmers.

Baseline balance by district

dist.output <- do.call(rbind, lapply(split(d, d$district), function(x) {
  
  tab <- do.call(rbind, lapply(out.list, function(y) {
    
    out <- t.test(x[,y] ~ x[,"client"], data=x)
    tab <- data.frame(out[[5]][[1]], out[[5]][[2]], out[3])
    tab[,1:2] <- round(tab[,1:2],3)
    names(tab) <- c(names(out[[5]]), "pvalue")
    #tab[,3] <- p.adjust(tab[,3], method="holm")
    #tab[,3] <- ifelse(tab[,3] < 0.001, "< 0.001", round(tab[,3],3))
    #print(tab)
    return(tab)
  }))
  
  return(data.frame(district = unique(x$district), tab))
}))

rownames(dist.output) <- NULL
dist.output$variable <- rep(out.list,length(unique(d$district)))    

# order variables 
dist.output <- dist.output[, c(1, 5, 2:4)]
dist.output$pvalue <- p.adjust(dist.output$pvalue, method="fdr")
dist.output <- dist.output[order(dist.output$pvalue),]

dist.output$pvalue <- ifelse(dist.output$pvalue < 0.001, "< 0.001", round(dist.output$pvalue,3))
colnames(dist.output) <- c("District", "Varible", "Non-Tubura", "Tubura Client", "p-value") 
print(kable(dist.output))
District Varible Non-Tubura Tubura Client p-value
70 Karongi n_season_fert 0.447 3.785 < 0.001
102 Mugonero n_season_fert 0.597 4.313 < 0.001
182 Rutsiro n_season_fert 1.066 4.200 < 0.001
134 Nyamasheke n_season_fert 1.507 5.068 < 0.001
198 Rwamagana n_season_fert 1.054 2.645 < 0.001
54 Huye n_season_fert 0.322 2.246 < 0.001
22 Gatsibo_NLWH n_season_fert 0.296 1.517 < 0.001
118 Nyamagabe n_season_fert 1.554 4.000 < 0.001
136 Nyamasheke n_season_lime 0.041 0.327 < 0.001
86 Kayonza n_season_fert 0.776 2.073 < 0.001
194 Rwamagana age 52.072 44.245 0.001
131 Nyamasheke hhsize 4.685 5.701 0.003
65 Karongi female 0.667 0.466 0.004
163 Nyaruguru hhsize 4.717 6.217 0.004
45 Gisagara m3.Mg 241.171 190.036 0.006
50 Huye age 49.400 43.395 0.009
44 Gisagara m3.Ca 1203.823 895.928 0.01
145 Nyanza female 0.543 0.217 0.012
150 Nyanza n_season_fert 0.152 1.326 0.019
18 Gatsibo_NLWH age 46.210 39.897 0.019
17 Gatsibo_NLWH female 0.556 0.322 0.021
2 Gatsibo_LWH age 49.949 41.931 0.027
113 Nyamagabe female 0.750 0.482 0.03
81 Kayonza female 0.672 0.400 0.03
20 Gatsibo_NLWH own 1.000 0.908 0.031
104 Mugonero n_season_lime 0.023 0.176 0.031
193 Rwamagana female 0.739 0.555 0.031
1 Gatsibo_LWH female 0.492 0.241 0.034
179 Rutsiro hhsize 4.844 5.624 0.034
99 Mugonero hhsize 5.054 5.802 0.049
98 Mugonero age 49.140 44.290 0.067
161 Nyaruguru female 0.652 0.391 0.077
48 Gisagara Total.N 0.154 0.142 0.079
184 Rutsiro n_season_lime 0.096 0.327 0.082
47 Gisagara Total.C 2.005 1.816 0.086
135 Nyamasheke n_season_compost 5.521 6.544 0.089
138 Nyamasheke n_seasons_leg_1 1.658 2.327 0.103
166 Nyaruguru n_season_fert 1.065 2.130 0.107
35 Gisagara hhsize 4.326 5.348 0.116
6 Gatsibo_LWH n_season_fert 1.475 2.552 0.123
59 Huye n_seasons_leg_2 4.860 3.667 0.123
51 Huye hhsize 4.574 5.132 0.141
115 Nyamagabe hhsize 4.696 5.482 0.141
41 Gisagara n_season_fallow 0.087 0.500 0.184
43 Gisagara n_seasons_leg_2 3.283 2.370 0.184
107 Mugonero n_seasons_leg_2 4.031 3.131 0.185
148 Nyanza own 1.000 0.913 0.192
196 Rwamagana own 0.973 0.909 0.192
46 Gisagara pH 5.945 5.759 0.208
27 Gatsibo_NLWH n_seasons_leg_2 3.889 3.046 0.215
38 Gisagara n_season_fert 0.348 1.087 0.215
72 Karongi n_season_lime 0.013 0.098 0.221
155 Nyanza n_seasons_leg_2 3.478 2.130 0.221
88 Kayonza n_season_lime 0.707 0.527 0.222
201 Rwamagana n_season_fallow 0.712 1.218 0.222
114 Nyamagabe age 48.875 43.768 0.225
176 Nyaruguru Total.N 0.170 0.161 0.234
205 Rwamagana m3.Mg 293.168 273.526 0.293
146 Nyanza age 50.152 44.717 0.33
33 Gisagara female 0.609 0.435 0.333
66 Karongi age 47.384 44.681 0.333
95 Kayonza Total.C 2.556 2.400 0.338
82 Kayonza age 46.500 42.345 0.345
64 Huye Total.N 0.148 0.143 0.348
23 Gatsibo_NLWH n_season_compost 3.420 2.632 0.355
92 Kayonza m3.Ca 1863.601 1664.716 0.355
183 Rutsiro n_season_compost 7.593 8.176 0.355
139 Nyamasheke n_seasons_leg_2 2.681 2.137 0.431
185 Rutsiro n_season_fallow 0.287 0.473 0.431
49 Huye female 0.591 0.500 0.44
84 Kayonza own 0.897 0.964 0.44
85 Kayonza field.size 423.776 664.236 0.44
96 Kayonza Total.N 0.187 0.180 0.44
108 Mugonero m3.Ca 604.767 555.744 0.44
122 Nyamagabe n_seasons_leg_1 1.125 1.625 0.44
168 Nyaruguru n_season_lime 0.000 0.043 0.44
177 Rutsiro female 0.623 0.545 0.44
190 Rutsiro pH 5.338 5.264 0.44
207 Rwamagana Total.C 2.232 2.169 0.44
24 Gatsibo_NLWH n_season_lime 0.025 0.069 0.449
158 Nyanza pH 5.999 6.105 0.464
164 Nyaruguru own 0.935 0.848 0.466
174 Nyaruguru pH 5.210 5.319 0.472
71 Karongi n_season_compost 6.553 7.074 0.478
68 Karongi own 0.981 0.957 0.502
130 Nyamasheke age 48.123 45.898 0.502
156 Nyanza m3.Ca 987.213 1083.103 0.502
188 Rutsiro m3.Ca 683.185 631.247 0.502
11 Gatsibo_LWH n_seasons_leg_2 2.690 1.983 0.512
58 Huye n_seasons_leg_1 1.043 1.386 0.514
110 Mugonero pH 5.242 5.181 0.514
157 Nyanza m3.Mg 210.773 230.037 0.514
173 Nyaruguru m3.Mg 138.204 151.996 0.514
191 Rutsiro Total.C 2.304 2.240 0.514
203 Rwamagana n_seasons_leg_2 1.685 1.349 0.514
67 Karongi hhsize 4.912 5.172 0.52
175 Nyaruguru Total.C 2.229 2.140 0.531
142 Nyamasheke pH 5.123 5.179 0.55
3 Gatsibo_LWH hhsize 5.763 5.362 0.598
42 Gisagara n_seasons_leg_1 2.870 2.304 0.598
56 Huye n_season_lime 0.113 0.035 0.598
204 Rwamagana m3.Ca 1376.064 1298.489 0.598
165 Nyaruguru field.size 495.239 570.022 0.609
4 Gatsibo_LWH own 1.000 0.983 0.611
52 Huye own 0.991 0.974 0.611
93 Kayonza m3.Mg 357.168 336.861 0.611
100 Mugonero own 0.915 0.947 0.611
128 Nyamagabe Total.N 0.165 0.169 0.611
181 Rutsiro field.size 430.156 584.327 0.611
97 Mugonero female 0.713 0.656 0.615
79 Karongi Total.C 1.832 1.877 0.626
189 Rutsiro m3.Mg 174.194 163.840 0.626
8 Gatsibo_LWH n_season_lime 0.322 0.500 0.636
37 Gisagara field.size 559.261 493.957 0.64
94 Kayonza pH 6.203 6.129 0.64
192 Rutsiro Total.N 0.160 0.157 0.655
206 Rwamagana pH 5.873 5.817 0.656
5 Gatsibo_LWH field.size 916.441 776.345 0.657
127 Nyamagabe Total.C 2.266 2.336 0.657
101 Mugonero field.size 393.124 441.359 0.66
208 Rwamagana Total.N 0.178 0.175 0.662
25 Gatsibo_NLWH n_season_fallow 0.543 0.736 0.673
75 Karongi n_seasons_leg_2 3.548 3.216 0.688
129 Nyamasheke female 0.692 0.646 0.688
57 Huye n_season_fallow 0.487 0.675 0.69
133 Nyamasheke field.size 580.130 787.527 0.69
147 Nyanza hhsize 4.891 5.261 0.69
172 Nyaruguru m3.Ca 615.230 665.044 0.693
144 Nyamasheke Total.N 0.167 0.165 0.696
117 Nyamagabe field.size 282.643 310.929 0.705
34 Gisagara age 46.848 44.761 0.708
109 Mugonero m3.Mg 169.468 159.707 0.725
10 Gatsibo_LWH n_seasons_leg_1 4.678 4.345 0.725
69 Karongi field.size 446.088 490.788 0.725
77 Karongi m3.Mg 269.452 254.229 0.726
21 Gatsibo_NLWH field.size 1459.901 1306.069 0.75
55 Huye n_season_compost 4.878 5.211 0.75
12 Gatsibo_LWH m3.Ca 1139.996 1206.971 0.762
80 Karongi Total.N 0.142 0.144 0.762
90 Kayonza n_seasons_leg_1 2.155 1.982 0.762
153 Nyanza n_season_fallow 0.870 1.130 0.762
180 Rutsiro own 0.934 0.915 0.762
39 Gisagara n_season_compost 3.957 4.326 0.763
78 Karongi pH 5.477 5.441 0.763
199 Rwamagana n_season_compost 3.748 4.018 0.765
106 Mugonero n_seasons_leg_1 1.186 1.336 0.774
9 Gatsibo_LWH n_season_fallow 0.288 0.397 0.774
19 Gatsibo_NLWH hhsize 5.160 5.333 0.774
36 Gisagara own 0.870 0.826 0.774
91 Kayonza n_seasons_leg_2 1.259 1.091 0.774
116 Nyamagabe own 0.982 0.964 0.774
119 Nyamagabe n_season_compost 7.518 7.857 0.774
169 Nyaruguru n_season_fallow 0.261 0.370 0.774
178 Rutsiro age 44.617 43.685 0.774
120 Nyamagabe n_season_lime 0.393 0.500 0.785
61 Huye m3.Mg 190.397 186.200 0.794
141 Nyamasheke m3.Mg 147.629 153.726 0.794
53 Huye field.size 567.922 599.096 0.805
123 Nyamagabe n_seasons_leg_2 2.929 3.161 0.805
29 Gatsibo_NLWH m3.Mg 254.332 246.971 0.819
132 Nyamasheke own 0.945 0.932 0.819
140 Nyamasheke m3.Ca 590.765 613.771 0.819
167 Nyaruguru n_season_compost 6.043 5.696 0.819
149 Nyanza field.size 766.217 853.359 0.823
170 Nyaruguru n_seasons_leg_1 2.348 2.043 0.823
154 Nyanza n_seasons_leg_1 2.283 2.543 0.833
197 Rwamagana field.size 842.054 892.855 0.836
202 Rwamagana n_seasons_leg_1 1.568 1.682 0.836
126 Nyamagabe pH 5.022 5.002 0.841
200 Rwamagana n_season_lime 0.324 0.355 0.841
60 Huye m3.Ca 870.054 853.743 0.852
62 Huye pH 5.665 5.684 0.853
30 Gatsibo_NLWH pH 6.061 6.038 0.857
103 Mugonero n_season_compost 6.070 6.229 0.877
137 Nyamasheke n_season_fallow 0.664 0.599 0.877
76 Karongi m3.Ca 803.893 787.032 0.895
171 Nyaruguru n_seasons_leg_2 3.370 3.178 0.913
16 Gatsibo_LWH Total.N 0.146 0.148 0.916
13 Gatsibo_LWH m3.Mg 235.470 239.641 0.924
14 Gatsibo_LWH pH 6.119 6.138 0.924
15 Gatsibo_LWH Total.C 1.866 1.887 0.924
63 Huye Total.C 1.919 1.910 0.924
105 Mugonero n_season_fallow 0.915 0.863 0.924
121 Nyamagabe n_season_fallow 0.304 0.268 0.924
124 Nyamagabe m3.Ca 449.585 456.505 0.924
125 Nyamagabe m3.Mg 108.307 106.662 0.924
143 Nyamasheke Total.C 2.285 2.298 0.924
159 Nyanza Total.C 1.778 1.761 0.924
186 Rutsiro n_seasons_leg_1 3.479 3.388 0.924
28 Gatsibo_NLWH m3.Ca 1246.188 1230.161 0.928
195 Rwamagana hhsize 4.982 4.927 0.928
83 Kayonza hhsize 5.155 5.091 0.932
7 Gatsibo_LWH n_season_compost 5.780 5.672 0.943
112 Mugonero Total.N 0.153 0.154 0.943
89 Kayonza n_season_fallow 0.483 0.455 0.947
187 Rutsiro n_seasons_leg_2 2.078 2.036 0.947
160 Nyanza Total.N 0.134 0.134 0.957
31 Gatsibo_NLWH Total.C 1.994 2.002 0.959
151 Nyanza n_season_compost 3.326 3.261 0.966
32 Gatsibo_NLWH Total.N 0.158 0.157 0.977
26 Gatsibo_NLWH n_seasons_leg_1 1.556 1.540 0.978
73 Karongi n_season_fallow 0.843 0.834 0.978
74 Karongi n_seasons_leg_1 2.497 2.485 0.978
87 Kayonza n_season_compost 3.534 3.564 0.978
111 Mugonero Total.C 2.203 2.206 0.978
162 Nyaruguru age 48.304 48.457 0.978
40 Gisagara n_season_lime 0.022 0.022 1
152 Nyanza n_season_lime 0.000 0.000 NA

District balance interpretation

Demographic variables interpretation here.

Agricultural practice variables interpretation here

Soil Variables interpretation here

write.csv(dist.output, file=paste(od, "district balance.csv", sep="/"), row.names=T)

Baseline balance by Tubura tenure

Look at farmers by duration of tenure farming with Tubura. We want to understand, at least with an initial naive baseline sense, what is the cumulative effect of Tubura practices on soil health outcomes?

We will look only at current Tubura farmers and compare first year farmers to farmers with more experience with Tubura.

oafOnly <- d[which(d$client==0 & d$total.seasons>=1),]

nTenure <- oafOnly %>% group_by(total.seasons) %>% 
  summarize(
    n = n()
  ) %>% ungroup() %>% as.data.frame()
nTenure$val <- paste(nTenure$total.seasons, " (", "n = ", nTenure$n, ")", sep = "")

for(i in 1:length(soilVars)){
print(
  ggplot(oafOnly, aes(x=as.factor(total.seasons), y=oafOnly[,soilVars[i]])) + 
    geom_boxplot() +
    scale_x_discrete(labels=nTenure$val) + 
    theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(x="Tubura Tenure", y=soilVars[i], title = paste("RW baseline soil by tenure - ", soilVars[i], sep = ""))
  )  
}

Tenure summaries

tenureSum <- aggregate(oafOnly[,out.list], by=list(oafOnly$total.seasons), function(x){
  round(mean(x, na.rm=T),2)
})
tenureSum <- as.data.frame(t(tenureSum))
colnames(tenureSum) <- c(paste(seq(1,11,1), " seas.", sep = ""), "13 seas.")
print(kable(tenureSum))
1 seas. 2 seas. 3 seas. 4 seas. 5 seas. 6 seas. 7 seas. 8 seas. 9 seas. 10 seas. 11 seas. 13 seas.
Group.1 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 10.00 11.00 13.00
female 0.70 0.61 0.67 0.65 0.64 0.58 0.75 0.33 0.33 0.75 0.67 0.00
age 48.77 46.43 46.46 48.96 42.73 43.83 42.12 52.33 36.00 48.50 42.00 52.00
hhsize 5.30 5.40 5.71 4.91 6.09 5.25 5.50 5.83 6.67 6.50 8.67 6.00
own 0.92 0.97 1.00 1.00 0.91 1.00 1.00 0.83 1.00 1.00 1.00 1.00
field.size 564.72 595.76 390.17 547.13 976.18 289.17 346.75 602.33 256.00 241.00 334.67 480.00
n_season_fert 1.03 1.27 3.17 1.65 1.82 2.83 2.38 5.33 2.67 3.75 0.33 10.00
n_season_compost 5.48 5.61 6.58 4.43 5.82 6.50 6.75 8.33 10.00 8.25 4.67 10.00
n_season_lime 0.27 0.17 0.25 0.09 0.18 0.33 0.00 0.17 0.00 0.00 0.00 0.00
n_season_fallow 0.53 0.74 0.58 1.00 0.45 0.50 0.88 0.00 0.00 0.00 1.33 0.00
n_seasons_leg_1 2.12 2.14 1.96 2.00 1.55 1.25 0.62 3.00 4.00 2.25 4.33 1.00
n_seasons_leg_2 3.13 2.85 4.50 2.57 1.90 2.33 5.88 3.50 3.33 5.25 0.00 9.00
m3.Ca 1127.45 946.96 1034.46 686.67 678.56 625.67 870.40 443.64 1153.25 483.06 746.88 632.20
m3.Mg 247.35 226.65 252.39 165.35 153.59 138.80 217.69 120.77 307.42 153.92 177.55 224.80
pH 5.72 5.56 5.70 5.20 5.34 5.34 5.64 5.10 5.87 5.18 5.50 5.54
Total.C 2.13 2.19 2.19 2.36 2.01 2.06 1.83 2.36 1.85 2.26 1.77 2.06
Total.N 0.16 0.16 0.16 0.17 0.15 0.15 0.14 0.16 0.15 0.15 0.13 0.16

Tenure balance

We’re defining Tubura tenure as having 3 or more seasons of experience farming with Tubura. We draw the line at 3 seasons as three seasons of fertilizer use is approximately when we’d expect fertilizer to start to have a detrimental effect on soil health.

oafOnly$tenured <- ifelse(oafOnly$total.seasons>=3,1,0)

tenure <- do.call(rbind, lapply(out.list, function(x) {
  
  out <- t.test(oafOnly[,x] ~ oafOnly[,"tenured"], data=oafOnly)
  tab <- data.frame(out[[5]][[1]], out[[5]][[2]], out[3])
  tab[,1:2] <- round(tab[,1:2],3)
  names(tab) <- c(names(out[[5]]), "pvalue")
  return(tab)
}))

# use p.adjust with bonferroni correction
tenure$pvalue <- p.adjust(tenure$pvalue, method="fdr")
rownames(tenure) <- out.list
tenure <- tenure[order(tenure$pvalue),]
tenure$pvalue <- ifelse(tenure[, 3] < 0.001, "< 0.001", round(tenure[, 3], 3)) 


colnames(tenure) <- c("Non-Tubura", "Tubura Client", "p-value") 
print(kable(tenure))
Non-Tubura Tubura Client p-value
n_season_fert 1.141 2.663 < 0.001
m3.Ca 1046.351 773.500 0.001
m3.Mg 238.052 189.232 0.004
pH 5.648 5.424 0.005
own 0.943 0.979 0.263
Total.N 0.162 0.156 0.263
n_season_compost 5.537 6.242 0.295
n_season_lime 0.225 0.158 0.364
age 47.718 45.979 0.44
hhsize 5.348 5.653 0.44
field.size 578.672 481.684 0.44
n_seasons_leg_2 3.004 3.426 0.44
female 0.661 0.621 0.618
n_seasons_leg_1 2.128 1.926 0.625
n_season_fallow 0.621 0.621 1
Total.C 2.154 2.150 1

Tenured v. new balance interpretation

Demographic variables We are well balanced along demographic variables.

Agricultural practice variables Not surprisingly, Tubura farmers have more cumulative years of fertilizer use than current non-Tubura farmers. While that difference is signficant, it is realistically only a single season of fertilizer use different.

Interestingly, non-Tubura farmers reported using more lime than current Tubura farmers. This

Soil Variables Soil pH, calcium and magnesium levels are lower for tenured Tubura farmers. This is consistent with the hypothesis that increaesd fertilizer use leads to an increaese in soil acidity.

Analysis of agronomic practices contributing to soil health outcomes

Here’s where we’ll look at the contribution of fertilizer, lime and cultivation practices on soil health outcomes. This analysis will be come richer as we gain longitudinal measures. I caution that we cannot treat these relationships as causal. The direction of causality is not clearly delineated in the data or the study design. However, we can identify meaningful connections between practices and outcomes through this analysis to generate new hypotheses for field testing.

I’m going to start with behaviors by sections and then move to a more comprehensive model including multiple practices. All models will include controls for site to account for local variation and field officer behavior.

Agronomic Behaviors

Check for multicollinearity before adding number of seasons of agronomic inputs on the same side of the regression.

suppressMessages(library(stargazer))

inputUse <- c("n_season_fert","n_season_compost", "n_season_lime", "n_season_fallow")

cor(d[,inputUse], use="complete.obs")
##                  n_season_fert n_season_compost n_season_lime
## n_season_fert       1.00000000       0.35646933    0.17075604
## n_season_compost    0.35646933       1.00000000    0.03828723
## n_season_lime       0.17075604       0.03828723    1.00000000
## n_season_fallow    -0.06439632      -0.16923273   -0.01116383
##                  n_season_fallow
## n_season_fert        -0.06439632
## n_season_compost     -0.16923273
## n_season_lime        -0.01116383
## n_season_fallow       1.00000000

Interpretation: The strongest correlation between the input use intensity variables is between seasons of fertilizer and compost use, ~0.35. While this is on the higher end it’s not necessarily cause for alarm.

inputUse <- paste(c("n_season_fert","n_season_compost", "n_season_lime", "n_season_fallow"), collapse= " + ")

list1 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", inputUse, "+ as.factor(cell)", sep="")), data=d)
  return(mod)
})

stargazer(list1, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Naive Agronomic Practice Models",
          covariate.labels = c("Seasons of Fertilizer", "Seasons of Compost", 
                               "Seasons of Lime", "Seasons of Fallow"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for cell",
          omit=c("cell"), out=paste(od, "rw_baseline_agprac.htm", sep="/"))
2016A Rwanda Soil Health Baseline - Naive Agronomic Practice Models
Ca Mg pH Total.N Total.C
(1) (2) (3) (4) (5)
Seasons of Fertilizer -5.351 -1.230* -0.001 -0.0004** -0.003
(3.266) (0.703) (0.003) (0.0002) (0.003)
Seasons of Compost 1.364 0.363 0.003 0.0002 0.002
(2.635) (0.567) (0.003) (0.0001) (0.003)
Seasons of Lime 41.646*** 8.175*** 0.022 0.002** 0.023
(14.233) (3.065) (0.014) (0.001) (0.015)
Seasons of Fallow -19.729*** -3.674*** -0.028*** 0.0004 0.009
(5.196) (1.119) (0.005) (0.0003) (0.005)
Constant 589.376 63.444 4.674*** 0.200*** 3.298***
(381.191) (82.085) (0.366) (0.021) (0.390)
Observations 2,443 2,443 2,443 2,443 2,443
R2 0.557 0.592 0.616 0.502 0.458
Adjusted R2 0.517 0.555 0.581 0.457 0.409
Residual Std. Error (df = 2238) 381.045 82.054 0.366 0.021 0.390
F Statistic (df = 204; 2238) 13.809*** 15.941*** 17.628*** 11.059*** 9.277***
Note: p<0.1; p<0.05; p<0.01
Includes FE for cell

Interpretation The naive model suggests that when we include site level fixed effects, duration of agronomic practices don’t have a big effect on soil health outcomes. However, some of the practice intensity variables are not well distributed. Let’s take a look at a log transformation. I’m adding one to the variables as to not end up with lots of Inf values.

Log transformations in theory are appropriate for variables that are right skewed (vavlues clustered to the left of the distribution) and see diminishing returns to increasing values. The shape of the data suggests a log transformation but it’s debateable whether the relationship is diminishing.

agPrac <- c(names(d[grep('n_season_', names(d))]))

for(i in 1:length(agPrac)){
  print(
    ggplot(d, aes(x=d[,agPrac[i]])) + geom_density() +
      labs(x = paste(agPrac[i], " No transform", sep = ""))
        )
}

# since these are all skewed, consider a log transform 
for(i in 1:length(agPrac)){
  print(
    ggplot(d, aes(x=log10(d[,agPrac[i]]+1))) + geom_density() + 
      labs(x = paste(agPrac[i], " Log transform", sep = ""))
        )
}

# look at other transfomations
for(i in 1:length(agPrac)){
  print(
    ggplot(d, aes(x=d[,agPrac[i]]^(1/3))) + geom_density() +
      labs(x = paste(agPrac[i], " cubic root transform", sep = ""))
        )
}

# visualize the outcomes as well to see if a transformation is warranted
for(i in 1:length(soilVars)){
  print(
    ggplot(d, aes(x=d[,soilVars[i]])) + geom_density() +
      labs(x = soilVars[i], title = soilVars[i])
        )
}

d$logFert <- log(d$n_season_fert+1)
d$logCompost <- log(d$n_season_compost+1)
d$logLime <- log(d$n_season_lime+1)
d$logFallow <- log(d$n_season_fallow+1)

Or look at BoxCox graph to empirically determine the right transformation. Log is assuming a diminishing return to an increasing X. That’s probably not the case with fertilizer. We’d actually expect an increasing return as values get larger. We use boxcox to see what the data suggest. We interpret it as follows:

  • lambda = 2 -> square
  • lambda = 1 -> no transformation
  • lambda = 0.5 -> square root
  • lambda = 0 -> log
  • lamdba = -1 -> inverse
library(MASS)
for(i in 1:length(agPrac)){
boxcox(lm(pH ~ d[,agPrac[i]], data=d))
}

For pH at least it seems like a log transform is appropriate. We can run this for all other variables as well to see what we get as well.

Let’s look at the log results: See here and here for guidance on intepreting log transformed right hand side variables. See here for additional guidance on choosing a transformation.

How to interpret RHS log transform: For a linear multivariate OLS regression, we say “a one unit increase in X causes a (coefficient) change in Y.” For a linear-log regression where the X variable is log transformed, we say a L percent change in X leads to a (coefficient*L) change in Y.

logVars <- paste(names(d[grep("log", names(d))]), collapse=" + ")

list2 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~",  logVars, "+ as.factor(cell)", sep="")), data=d)
  return(mod)
})

list2b <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~",  logVars, sep="")), data=d)
  return(mod)
})

suppressWarnings(
  stargazer(list2, list2b, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Log Agronomic Practice Models",
          covariate.labels = c("Seasons of Fertilizer (log)", "Seasons of Compost (log)", "Seasons of Lime (log)", "Seasons of Fallow (log)"),
          column.labels = c(rep(gsub("m3.","", soilVars),2)),
          dep.var.caption = "",
          dep.var.labels = c("",""),
          add.lines = list(c("Cell FE?", rep("Yes", 5), rep("No", 5))),
          notes = "Includes FE for cell",
          omit=c("cell"), out=paste(od, "rw_baseline_agprac_log.htm", sep="/"))
)
2016A Rwanda Soil Health Baseline - Log Agronomic Practice Models
Ca Mg pH Total.N Total.C Ca Mg pH Total.N Total.C
(1) (2) (3) (4) (5) (6) (7) (8) (9) (10)
Seasons of Fertilizer (log) -23.685** -5.633** -0.008 -0.002** -0.013 -53.105*** -7.185** -0.046*** -0.002** -0.017
(11.366) (2.449) (0.011) (0.001) (0.012) (14.518) (3.312) (0.015) (0.001) (0.014)
Seasons of Compost (log) 7.778 1.726 0.016 0.001 0.008 -96.407*** -14.142*** -0.111*** 0.0003 0.028**
(11.964) (2.578) (0.011) (0.001) (0.012) (14.382) (3.281) (0.015) (0.001) (0.014)
Seasons of Lime (log) 122.237*** 22.996*** 0.063** 0.005*** 0.068** 312.150*** 41.254*** 0.204*** 0.014*** 0.188***
(32.098) (6.917) (0.031) (0.002) (0.033) (38.120) (8.697) (0.039) (0.002) (0.036)
Seasons of Fallow (log) -72.103*** -13.697*** -0.103*** 0.001 0.021 -125.822*** -26.255*** -0.169*** 0.002 0.034*
(15.227) (3.281) (0.015) (0.001) (0.016) (19.660) (4.485) (0.020) (0.001) (0.019)
Constant 582.312 62.119 4.662*** 0.199*** 3.293*** 1,075.500*** 239.116*** 5.785*** 0.156*** 2.047***
(380.303) (81.949) (0.365) (0.021) (0.390) (26.342) (6.010) (0.027) (0.001) (0.025)
Cell FE? Yes Yes Yes Yes Yes No No No No No
Observations 2,443 2,443 2,443 2,443 2,443 2,443 2,443 2,443 2,443 2,443
R2 0.560 0.594 0.620 0.502 0.458 0.061 0.030 0.060 0.021 0.014
Adjusted R2 0.520 0.557 0.585 0.457 0.409 0.060 0.028 0.058 0.019 0.012
Residual Std. Error 379.815 (df = 2238) 81.843 (df = 2238) 0.364 (df = 2238) 0.021 (df = 2238) 0.390 (df = 2238) 531.648 (df = 2438) 121.288 (df = 2438) 0.549 (df = 2438) 0.029 (df = 2438) 0.504 (df = 2438)
F Statistic 13.970*** (df = 204; 2238) 16.079*** (df = 204; 2238) 17.886*** (df = 204; 2238) 11.078*** (df = 204; 2238) 9.285*** (df = 204; 2238) 39.691*** (df = 4; 2438) 18.661*** (df = 4; 2438) 38.608*** (df = 4; 2438) 13.070*** (df = 4; 2438) 8.376*** (df = 4; 2438)
Note: p<0.1; p<0.05; p<0.01
Includes FE for cell
# a 10 percent increase in x leads to B1*.1 change in Y
logTrans <- do.call(cbind, lapply(list2, function(x){
  coeff = x$coefficients[2:5]
  tenPercent = round(coeff*.1, 5)
  names(tenPercent) <- paste("10% increase in ", gsub("log","", names(tenPercent)), " leads to:", sep="")
  return(tenPercent)
}))
colnames(logTrans) <- soilVars
print(kable(logTrans))
m3.Ca m3.Mg pH Total.N Total.C
10% increase in Fert leads to: -2.36850 -0.56333 -0.00079 -0.00015 -0.00132
10% increase in Compost leads to: 0.77779 0.17263 0.00158 0.00007 0.00076
10% increase in Lime leads to: 12.22368 2.29962 0.00629 0.00050 0.00676
10% increase in Fallow leads to: -7.21034 -1.36970 -0.01026 0.00009 0.00210

Interpretation: When we transform the variables to log, the data starts to tell a more coherent story, at least directionally. If we remove the district FE, the coefficients gain significance.

  • Additional seasons of fertilizer use relates to a decrease in soil N and C.
  • Seasons of compost and lime have the opposite, positive effect on soil N.
  • Unsurprisingly, soil health metrics are poorer for soils on which more fertilizer has been used and lime has not been used.

Tenure with One Acre Fund

Let’s look first at a naive model of One Acre Fund tenure on soil health. Remember: these data are not longitudinal! These data are not longitudinal and reflect farmer selection into One Acre Fund. While these models will try to be both robust and parsimonious, we will inevitabily suffer omitted variable bias due to a lack of an instrument.

list3 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~",  "total.seasons + as.factor(cell)", sep="")), data=d)
  return(mod)
})

stargazer(list3, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Naive Tenure Models",
          covariate.labels = c("OAF Tenure"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for cell",
          omit=c("cell"), out=paste(od, "rw_baseline_tenure.htm", sep="/"))
2016A Rwanda Soil Health Baseline - Naive Tenure Models
Ca Mg pH Total.N Total.C
(1) (2) (3) (4) (5)
OAF Tenure -0.788 -0.122 0.002 -0.0002 -0.001
(2.810) (0.604) (0.003) (0.0002) (0.003)
Constant 596.407 65.140 4.683*** 0.201*** 3.308***
(382.946) (82.379) (0.369) (0.021) (0.390)
Observations 2,443 2,443 2,443 2,443 2,443
R2 0.552 0.589 0.610 0.500 0.457
Adjusted R2 0.512 0.552 0.575 0.455 0.408
Residual Std. Error (df = 2241) 382.905 82.370 0.369 0.021 0.390
F Statistic (df = 201; 2241) 13.757*** 15.955*** 17.423*** 11.155*** 9.377***
Note: p<0.1; p<0.05; p<0.01
Includes FE for cell

Interpretation: The naive One Acre Fund tenure model suggest that across the board that additional years of 1AF practices have a negative effect on soil health parameters. Let’s combine 1AF tenure with the agronomic practices model above to build a more robust model:

list4 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~",  logVars, "+ total.seasons + as.factor(cell)", sep="")), data=d)
  return(mod)
})

stargazer(list4, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Ag Practice and Tenure",
          covariate.labels = c("Seasons of Fertilizer (log)", "Seasons of Compost (log)", "Seasons of Lime (log)", "Seasons of Fallow (log)", "OAF Tenure"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for cell",
          omit=c("cell"), out=paste(od, "rw_baseline_ag_tenure.htm", sep="/"))
2016A Rwanda Soil Health Baseline - Ag Practice and Tenure
Ca Mg pH Total.N Total.C
(1) (2) (3) (4) (5)
Seasons of Fertilizer (log) -28.649** -7.229** -0.017 -0.001* -0.011
(13.713) (2.955) (0.013) (0.001) (0.014)
Seasons of Compost (log) 7.871 1.756 0.016 0.001 0.008
(11.966) (2.578) (0.011) (0.001) (0.012)
Seasons of Lime (log) 122.500*** 23.081*** 0.063** 0.005*** 0.068**
(32.105) (6.917) (0.031) (0.002) (0.033)
Seasons of Fallow (log) -72.522*** -13.831*** -0.103*** 0.001 0.021
(15.242) (3.284) (0.015) (0.001) (0.016)
OAF Tenure 2.215 0.712 0.004 -0.0001 -0.001
(3.423) (0.737) (0.003) (0.0002) (0.004)
Constant 577.732 60.646 4.654*** 0.199*** 3.295***
(380.418) (81.964) (0.365) (0.021) (0.390)
Observations 2,443 2,443 2,443 2,443 2,443
R2 0.560 0.595 0.620 0.503 0.458
Adjusted R2 0.520 0.557 0.585 0.457 0.409
Residual Std. Error (df = 2237) 379.864 81.845 0.364 0.021 0.390
F Statistic (df = 205; 2237) 13.900*** 16.005*** 17.810*** 11.022*** 9.236***
Note: p<0.1; p<0.05; p<0.01
Includes FE for cell

Interpretation: Including agronomic practices and 1AF tenure in the same model dampens the magnitude, but not the significance, of 1AF tenure on soil health outcomes.

Agronomic practices - 15B cultivation practices

Thus far we have looked at aggregated historical plot level practices and their effect on soil health. We also asked farmers about their cultivation practices on their plot in the previous season, 15B. We have more precise information for fertilizer, compost and liming practices for the 15B season.

# scale all the field application variables to ares
d$ares <- d$field.size/100
d$fert1.are <- d$field_kg_fert1_15b/d$ares
d$fert2.are <- d$field_kg_fert2_15b/d$ares
d$compost.are <- d$field_kg_compost_15b/d$ares
d$lime.are <- d$kg_lime_15b/d$ares

intensityVars <- c("fert1.are", "fert2.are",
                   "compost.are", "lime.are")

cor(d[,intensityVars], use="complete.obs")
##             fert1.are fert2.are compost.are  lime.are
## fert1.are   1.0000000 0.9889407   0.5216134 0.8679373
## fert2.are   0.9889407 1.0000000   0.5328874 0.8539114
## compost.are 0.5216134 0.5328874   1.0000000 0.6236616
## lime.are    0.8679373 0.8539114   0.6236616 1.0000000

Graph the application/are variables

for(i in 1:length(intensityVars)){
  print(
  ggplot(d, aes(x=d[,intensityVars[i]])) + geom_density() +
      labs(x = intensityVars[i], title = intensityVars[i])
  )
}
## Warning: Removed 1800 rows containing non-finite values (stat_density).

## Warning: Removed 2233 rows containing non-finite values (stat_density).

## Warning: Removed 143 rows containing non-finite values (stat_density).

## Warning: Removed 2399 rows containing non-finite values (stat_density).

Conclusion - Take 1: The application rate per are variables are weird. I think it’s because of the field dimensions. I’m going to go back to the field dimensions and check this.

Let’s look at the dimensions of the fields that have large application rates

d[d$fert1.are>10 & !is.na(d$fert1.are), c("field_dim1", "field_dim2", "field.size", "fert1.are")]
##      field_dim1 field_dim2 field.size fert1.are
## 828          48         12        576  15.27778
## 837          10         25        250  35.20000
## 936           5          8         40  25.00000
## 946          25         25        625  14.08000
## 1158          4         20         80  12.50000
## 2380          3          5         15  13.33333
d[d$fert2.are>10 & !is.na(d$fert2.are), c("field_dim1", "field_dim2", "field.size", "fert2.are")]
##     field_dim1 field_dim2 field.size fert2.are
## 828         48         12        576  15.27778
## 830          8         35        280  31.42857
## 837         10         25        250  35.20000
## 877         10          5         50  42.00000
d[d$compost.are>500 & !is.na(d$compost.are), c("field_dim1", "field_dim2", "field.size", "compost.are")]
##      field_dim1 field_dim2 field.size compost.are
## 93           13          8        104    769.2308
## 94            5          4         20   1000.0000
## 181           6          6         36    833.3333
## 205          10          5         50   1000.0000
## 383           2         20         40    750.0000
## 764          26          1         26    807.6923
## 885          15          2         30    666.6667
## 936           5          8         40   1250.0000
## 1128         15          6         90   2223.3333
## 1525         12        123       1476   1507.4526
## 1627         33          3         99    606.0606
## 2366          7         40        280   1714.6429
## 2380          3          5         15    666.6667
# there's a field that is 1 meter wide? Surely not.

d[abs(d$lime.are)>40 & !is.na(d$lime.are), c("field_dim1", "field_dim2", "field.size", "lime.are")]
##      field_dim1 field_dim2 field.size lime.are
## 1101         11         12        132 113.6364
## 1119         25         10        250  60.0000
# how is there a negative quantity of lime?
# this should be kg of fertilizer used in this field. Compost is off the charts. Convert this to compost per sq meter
previousSeason <- paste("fert1.are", "as.factor(cell)", sep=" + ")

list5 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", previousSeason, sep="")), data=d)
  return(mod)
})
stargazer(list5, type="html", 
          title = "2016A Rwanda Soil Health Baseline - 15B practices",
          covariate.labels = c("Fertilizer Rate (log)"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for cell",
          omit=c("cell"), out=paste(od, "rw_baseline_15b_ag.htm", sep="/"))
2016A Rwanda Soil Health Baseline - 15B practices
Ca Mg pH Total.N Total.C
(1) (2) (3) (4) (5)
Fertilizer Rate (log) -1.683 -1.015 -0.005 -0.0004 -0.005
(7.659) (1.801) (0.008) (0.0005) (0.010)
Constant 1,574.543*** 324.301*** 5.911*** 0.192*** 2.399***
(162.897) (38.316) (0.175) (0.011) (0.205)
Observations 643 643 643 643 643
R2 0.703 0.717 0.686 0.607 0.544
Adjusted R2 0.591 0.611 0.569 0.460 0.373
Residual Std. Error (df = 467) 325.048 76.456 0.350 0.021 0.408
F Statistic (df = 175; 467) 6.305*** 6.772*** 5.836*** 4.125*** 3.181***
Note: p<0.1; p<0.05; p<0.01
Includes FE for cell

Farmer fertility perception

Let’s look at farmer perceived fertility as a predictor of soil health. We’ll set ‘same fertility’ as the reference category.

d$fertility_qual <- relevel(as.factor(d$general_field_infocompare_fertil), ref="same")

list6 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", "+ fertility_qual + as.factor(cell)", sep="")), data=d)
  return(mod)
})


stargazer(list6, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility",
          covariate.labels = c("Farmer Opinion - Less Fertile",
                               "Farmer Opinion - More Fertile"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Reference category is same fertility as other fields. Includes FE for cell",
          notes.align = "l",
          omit=c("cell"), out=paste(od, "rw_baseline_fertility_qual.htm", sep="/"))
2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility
Ca Mg pH Total.N Total.C
(1) (2) (3) (4) (5)
Farmer Opinion - Less Fertile -112.045*** -19.157*** -0.121*** -0.0004 0.003
(21.335) (4.601) (0.021) (0.001) (0.022)
Farmer Opinion - More Fertile 5.575 1.536 -0.002 -0.001 -0.003
(23.554) (5.079) (0.023) (0.001) (0.024)
Constant 706.875* 84.054 4.808*** 0.201*** 3.302***
(380.991) (82.158) (0.366) (0.021) (0.391)
Observations 2,443 2,443 2,443 2,443 2,443
R2 0.558 0.592 0.616 0.500 0.457
Adjusted R2 0.519 0.555 0.581 0.455 0.408
Residual Std. Error (df = 2240) 380.393 82.029 0.366 0.021 0.390
F Statistic (df = 202; 2240) 14.022*** 16.105*** 17.797*** 11.085*** 9.325***
Note: p<0.1; p<0.05; p<0.01
Reference category is same fertility as other fields. Includes FE for cell

Interpretation: Farmers understand their fields well. Their categorization of which field are more and less fertile corresponds to our quantified measures of soil health. The only features farmers don’t seem to get correct are nitrogen and carbon. The nitrogen and carbon levels are indistinguishable in ‘low fertility’ fields relative to the fields deemed to be the ‘same fertility.’ Reminder: We need to remember that farmers are only evaluting one of their fields thus we are not able to account for the quality of the farmer in assessing his/her fields.

Deeper dive into farmer perception of fertility

  • Look at wet chem values for all soil features by farmer perception
  • Run PCA on all soil attributes to see which principle components predict soil fertility
  • Consider adding additional control variables into model (like Ca as control for pH perception model)
# merge wetChem in with d
names(wetChem)[2:21] <- paste("wet.", names(wetChem)[2:21], sep = "")
d <- left_join(d, wetChem, by="SSN")
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector
wetVars <- names(d)[grep("wet.", names(d))]
for(i in 1:length(wetVars)){
  print(
  ggplot(data=d, aes(x=as.factor(client), y=d[,wetVars[i]])) + 
    geom_boxplot() +
    labs(x="Tubura Farmer", y=wetVars[i], title = paste("RW baseline wet chem - ", wetVars[i], sep = ""))
  )
}
## Warning: Removed 2206 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2206 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

list7 <- lapply(wetVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", "+ fertility_qual + as.factor(cell)", sep="")), data=d)
  return(mod)
})

suppressWarnings(
stargazer(list7, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility (wet chem)",
          covariate.labels = c("Farmer Opinion - Less Fertile",
                               "Farmer Opinion - More Fertile"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("wet.","", wetVars)),
          notes = "Reference category is same fertility as other fields. Includes FE for cell",
          notes.align = "l",
          omit=c("cell"), out=paste(od, "rw_baseline_fertility_qual_wet.htm", sep="/"))
)
2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility (wet chem)
pH EC..S. P K Ca Mg S Na Fe Mn B Cu Zn C.E.C TN C.N Exch..Acidity Acid.Saturation Exch.Al OC
(1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (12) (13) (14) (15) (16) (17) (18) (19) (20)
Farmer Opinion - Less Fertile -0.227 1.930 -9.111 -44.922 -23.678 -22.846 2.274 -0.079 22.320 -29.748 -0.064 -0.317 -0.224 0.138 0.004 -0.277 0.343** 9.657** 0.268** 0.153
(0.147) (11.629) (19.670) (28.712) (150.522) (36.856) (2.236) (2.929) (17.023) (21.301) (0.222) (0.337) (0.764) (1.126) (0.008) (0.627) (0.160) (3.686) (0.124) (0.135)
Farmer Opinion - More Fertile -0.109 -7.216 -16.525 -69.198** -187.493 -57.688 2.223 -0.854 -12.790 -10.391 -0.069 -0.278 -1.040 -1.369 -0.016* 0.287 0.125 3.831 0.091 -0.192
(0.174) (13.775) (23.740) (34.653) (181.669) (44.482) (2.699) (3.535) (20.546) (25.708) (0.268) (0.407) (0.922) (1.359) (0.010) (0.757) (0.193) (4.449) (0.150) (0.163)
Constant 6.225*** 127.000*** 8.910 213.000*** 1,745.000*** 435.000*** 15.700** 24.300*** 61.450 194.000*** 0.415 2.670*** 4.415** 15.650*** 0.195*** 11.750*** 0.140 1.100 0.023 2.335***
(0.406) (32.067) (55.599) (81.159) (425.475) (104.178) (6.321) (8.278) (48.119) (60.210) (0.627) (0.953) (2.160) (3.182) (0.023) (1.772) (0.451) (10.419) (0.351) (0.383)
Observations 237 237 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244
R2 0.742 0.580 0.702 0.653 0.760 0.755 0.729 0.705 0.769 0.688 0.525 0.693 0.540 0.752 0.773 0.671 0.775 0.777 0.797 0.731
Adjusted R2 0.425 0.064 0.348 0.240 0.475 0.463 0.407 0.354 0.494 0.318 -0.039 0.329 -0.006 0.457 0.504 0.280 0.506 0.511 0.555 0.412
Residual Std. Error 0.574 (df = 106) 45.349 (df = 106) 78.629 (df = 111) 114.776 (df = 111) 601.713 (df = 111) 147.330 (df = 111) 8.940 (df = 111) 11.707 (df = 111) 68.051 (df = 111) 85.150 (df = 111) 0.887 (df = 111) 1.347 (df = 111) 3.055 (df = 111) 4.501 (df = 111) 0.033 (df = 111) 2.506 (df = 111) 0.638 (df = 111) 14.735 (df = 111) 0.496 (df = 111) 0.541 (df = 111)
F Statistic 2.344*** (df = 130; 106) 1.125 (df = 130; 106) 1.983*** (df = 132; 111) 1.580*** (df = 132; 111) 2.664*** (df = 132; 111) 2.586*** (df = 132; 111) 2.264*** (df = 132; 111) 2.010*** (df = 132; 111) 2.796*** (df = 132; 111) 1.857*** (df = 132; 111) 0.930 (df = 132; 111) 1.902*** (df = 132; 111) 0.988 (df = 132; 111) 2.551*** (df = 132; 111) 2.867*** (df = 132; 111) 1.715*** (df = 132; 111) 2.888*** (df = 132; 111) 2.923*** (df = 132; 111) 3.293*** (df = 132; 111) 2.290*** (df = 132; 111)
Note: p<0.1; p<0.05; p<0.01
Reference category is same fertility as other fields. Includes FE for cell

Interpretation: Our sample size decreases considerably when we look only at wet chemistry results. Thus, we do not see the same signficant relationships we saw between farmer perceived fertility and soil characteristics we saw when looking at the predicted values.

Principal Component Analysis (predicted values)

pca1 <- prcomp(d[,soilVars], scale=TRUE, center=TRUE)
print(pca1)
## Standard deviations:
## [1] 1.6054877 1.4130381 0.4829896 0.3401430 0.2770491
## 
## Rotation:
##                 PC1        PC2         PC3         PC4         PC5
## m3.Ca    0.59320505 -0.1382841  0.17987753 -0.19239465 -0.74807329
## m3.Mg    0.57530571 -0.0363033 -0.77066159  0.02943037  0.27003700
## pH       0.55976522  0.2080885  0.58530931  0.27208080  0.47617971
## Total.N  0.05235072 -0.6836420  0.17634506 -0.60421041  0.36568460
## Total.C -0.03245639 -0.6847572  0.00634184  0.72320119 -0.08363026
plot(pca1)

pca1$rotation
##                 PC1        PC2         PC3         PC4         PC5
## m3.Ca    0.59320505 -0.1382841  0.17987753 -0.19239465 -0.74807329
## m3.Mg    0.57530571 -0.0363033 -0.77066159  0.02943037  0.27003700
## pH       0.55976522  0.2080885  0.58530931  0.27208080  0.47617971
## Total.N  0.05235072 -0.6836420  0.17634506 -0.60421041  0.36568460
## Total.C -0.03245639 -0.6847572  0.00634184  0.72320119 -0.08363026

The first principal component is composed primarily of soil pH related variables, Ca, Mg, and pH. The second capture N and C. These groupings (pH grouping and C and N) are not all that surprising given that our predicted soil variable set is fairly limited.

If the variables have the same sign that indicates that they are positively correlated in the principal component. In the first principal component, we see Ca, Mg and pH loading in the same direction. In the second principal component, we see N and C moving in the same direction. Ca and Mg are also associated in the same direction but to a lesser extent.

ggplot(as.data.frame(pca1$x),aes(x=PC1,y=PC2, color=d$fertility_qual)) + geom_point() + 
  labs(title = "PCA of soil attributes and farmer perceived soil fertility",
       x = "First PCA", y= "Second PCA", color="Field Quality")

When we layer farmer perceived soil fertility on top of the principal component scatter plot, no clear pattern emergres. Fields with the same fertilitiy, less and more fertility are indistinguishable by the principal components. Thus, there is not a clear profile for farmer identified healthier soil or weaker soil based on principal components.

Geographic Descriptors - PCA for soil profiles

This section will build on the principal component work above and look at improving understanding of local context to inform local adaptation. Here we will also construct soil profiles to simplify the scaling of promising products and practices to targeted locations. We don’t have a full suite of predictors so we can’t look at a comprehensive soil profile.

ggplot(as.data.frame(pca1$x),aes(x=PC1,y=PC2, color=d$district)) + geom_point() + 
  labs(title = "PCA of soil attributes and district",
       x = "First PCA", y= "Second PCA", color="District")

When we color the figure by district, we start to see a pattern emerge. However, there are too many points to clearly detect where all districts fall. Let’s instead look at the data by AEZ.

ggplot(as.data.frame(pca1$x),aes(x=PC1,y=PC2, color=d$aez)) + geom_point() + 
  labs(title = "PCA of soil attributes and AEZ",
       x = "First PCA - Ca/Mg/pH", y= "Second PCA - N and C", color="AEZ")

Coloring the points by AEZ, we see a much clearer trend. The east is to the right of our graphic, then the central plateau followed by lake Kive and then Congo Nile in green on the left. What does this mean in terms of actual soil features? Let’s look at the figure but with the variable associations layered on top. See here for documentation

library(pca3d)
pca2d( pca1, biplot= TRUE, shape= 19, col= "black"  )

It appears that the right side of the graph, the eastern AEZ, is associated with pH, Mg and Ca. This indicates that as the first principal component increases, so does the level of pH, Ca and Mg. Conversely, total N and C increase as the second principal component decreases. In terms of the AEZ figure above, this suggest that the Congo Nile AEZ has higher levels of N and C. Let’s test these hypotheses with a simple summary table:

print(kable(aggregate(d[,soilVars], by=list(d$aez), function(x){
  round(mean(x, na.rm=T),5)
})))
Group.1 m3.Ca m3.Mg pH Total.N Total.C
Central Plateau 850.7738 191.0218 5.64841 0.14569 1.89463
Congo Nile 480.7509 122.1544 5.02397 0.16675 2.36796
East 1360.7312 277.7374 6.01518 0.16715 2.13319
Lake Kivu 803.3210 236.7244 5.47929 0.14917 2.04464

Confirmed! This likely also suggests that Congo Nile is at a higher altitude than the surrounding areas and that eastern Rwanda has relatively less weathered soils compared to western.

Consequence for trial placement: The Rwanda program is already blocking trials by AEZ but these data confirm that AEZ reflects meaningfu soil variation and thus captures key growing conditions for Rwandan farmers. Blocking trials by AEZ in Rwanda will enable us to evaluate trial hypotheses in more neutral pH ranges and higher N and C conditions.

Principal Component Analysis (wet chemistry values)

wetVal <- d[complete.cases(d[,wetVars]),]

pca2 <- prcomp(wetVal[,c("wet.C.E.C", "wet.pH", "wet.Mg", "wet.Ca")], center=TRUE, scale=TRUE)

pca2 <- prcomp(wetVal[, wetVars], center=TRUE, scale=TRUE)
#plot(pca2)
ggplot(as.data.frame(pca2$x),aes(x=PC1,y=PC2, color=as.factor(wetVal$aez))) + geom_point() +
  labs(title = "Wet Chem PCA with AEZ", x = "First PC", y="Second PC",
       color="AEZ")

#pca2$rotation
pca2d(pca2, biplot= TRUE, shape= 19, col= "black")

# put pca2$x PC1 in the main data to run the fertility perception model.
#mod8 <- lm(
suppressWarnings(
stargazer(list7, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility (wet chem)",
          covariate.labels = c("Farmer Opinion - Less Fertile",
                               "Farmer Opinion - More Fertile"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("wet.","", wetVars)),
          notes = "Reference category is same fertility as other fields. Includes FE for cell",
          notes.align = "l",
          omit=c("cell"), out=paste(od, "rw_baseline_fertility_qual_wet.htm", sep="/"))
)
2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility (wet chem)
pH EC..S. P K Ca Mg S Na Fe Mn B Cu Zn C.E.C TN C.N Exch..Acidity Acid.Saturation Exch.Al OC
(1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (12) (13) (14) (15) (16) (17) (18) (19) (20)
Farmer Opinion - Less Fertile -0.227 1.930 -9.111 -44.922 -23.678 -22.846 2.274 -0.079 22.320 -29.748 -0.064 -0.317 -0.224 0.138 0.004 -0.277 0.343** 9.657** 0.268** 0.153
(0.147) (11.629) (19.670) (28.712) (150.522) (36.856) (2.236) (2.929) (17.023) (21.301) (0.222) (0.337) (0.764) (1.126) (0.008) (0.627) (0.160) (3.686) (0.124) (0.135)
Farmer Opinion - More Fertile -0.109 -7.216 -16.525 -69.198** -187.493 -57.688 2.223 -0.854 -12.790 -10.391 -0.069 -0.278 -1.040 -1.369 -0.016* 0.287 0.125 3.831 0.091 -0.192
(0.174) (13.775) (23.740) (34.653) (181.669) (44.482) (2.699) (3.535) (20.546) (25.708) (0.268) (0.407) (0.922) (1.359) (0.010) (0.757) (0.193) (4.449) (0.150) (0.163)
Constant 6.225*** 127.000*** 8.910 213.000*** 1,745.000*** 435.000*** 15.700** 24.300*** 61.450 194.000*** 0.415 2.670*** 4.415** 15.650*** 0.195*** 11.750*** 0.140 1.100 0.023 2.335***
(0.406) (32.067) (55.599) (81.159) (425.475) (104.178) (6.321) (8.278) (48.119) (60.210) (0.627) (0.953) (2.160) (3.182) (0.023) (1.772) (0.451) (10.419) (0.351) (0.383)
Observations 237 237 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244
R2 0.742 0.580 0.702 0.653 0.760 0.755 0.729 0.705 0.769 0.688 0.525 0.693 0.540 0.752 0.773 0.671 0.775 0.777 0.797 0.731
Adjusted R2 0.425 0.064 0.348 0.240 0.475 0.463 0.407 0.354 0.494 0.318 -0.039 0.329 -0.006 0.457 0.504 0.280 0.506 0.511 0.555 0.412
Residual Std. Error 0.574 (df = 106) 45.349 (df = 106) 78.629 (df = 111) 114.776 (df = 111) 601.713 (df = 111) 147.330 (df = 111) 8.940 (df = 111) 11.707 (df = 111) 68.051 (df = 111) 85.150 (df = 111) 0.887 (df = 111) 1.347 (df = 111) 3.055 (df = 111) 4.501 (df = 111) 0.033 (df = 111) 2.506 (df = 111) 0.638 (df = 111) 14.735 (df = 111) 0.496 (df = 111) 0.541 (df = 111)
F Statistic 2.344*** (df = 130; 106) 1.125 (df = 130; 106) 1.983*** (df = 132; 111) 1.580*** (df = 132; 111) 2.664*** (df = 132; 111) 2.586*** (df = 132; 111) 2.264*** (df = 132; 111) 2.010*** (df = 132; 111) 2.796*** (df = 132; 111) 1.857*** (df = 132; 111) 0.930 (df = 132; 111) 1.902*** (df = 132; 111) 0.988 (df = 132; 111) 2.551*** (df = 132; 111) 2.867*** (df = 132; 111) 1.715*** (df = 132; 111) 2.888*** (df = 132; 111) 2.923*** (df = 132; 111) 3.293*** (df = 132; 111) 2.290*** (df = 132; 111)
Note: p<0.1; p<0.05; p<0.01
Reference category is same fertility as other fields. Includes FE for cell

Measured Yields

We don’t have measured yield data for the Rwanda baseline. Skip this for now.

Distributions of soil health levels for different sub-groups

Mapping

Soil health maps for soil health study areas

#crdref <- CRS('+proj=longlat +datum=WGS84')
crdref <- CRS('+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0')
e <- d[!is.na(d$lon),]
ss <- SpatialPointsDataFrame(coords = e[, c("lon", "lat")], data=e, proj4string = crdref)

rw <- getData("GADM", country='RW', level=3, path = "/Users/mlowes/drive/soil health study/data") # the higher the number, the higher the resolution

#ext.rw.ss matches points with spatial polygons in rw
ext.rw.ss <- extract(rw[, "NAME_3"], ss)
## Loading required namespace: rgeos
ss$spatialname <- ext.rw.ss$NAME_3

frw <- fortify(rw, region="NAME_3")
ss.soil <- aggregate(ss@data[,soilVars], by=list(ss@data$spatialname), function(x){
  mean(x, na.rm=T)
})

plotReady <- dplyr::left_join(frw, ss.soil, by=c("id"="Group.1"))

Plot simple summary of soil characteristics

Consider revising these maps to a smaller greographic unit. Add the name of the location for uninitiated users.

library(RColorBrewer)
mapList <- list()
for(i in 1:length(soilVars)){
mapRes <-  ggplot(plotReady, aes(x=long, y=lat, group=group)) + geom_path() + 
  geom_polygon(aes(fill=plotReady[,soilVars[i]])) + 
  scale_fill_gradientn(colours = rev(brewer.pal(9,"Reds")), # define colors
                       name = soilVars[i],
                       guide = guide_colorbar(legend.direction = "vertical")) + 
  theme_bw() + 
  labs(title=paste("Rwanda long term soil health baseline - 2016", soilVars[i], sep= " "), x = "Longitude", y="Latitude")
mapList[[i]] <- mapRes
print(mapRes)
} 

# This is a small experiment to combine raster (spdf) and leaflet and be able to access the data in the raster interactively.
#mapLayer <- sp::merge(rw, ss.soil, by.x="NAME_3", by.y="Group.1")

# fill = T, fillOpacity = 0.7, fillColor = d.fill, 
#         stroke = T, color = "white", weight = 2, dashArray = 3, 
#         opacity = 0.5, popup = county.tt(d)

# leaflet(mapLayer) %>% addTiles() %>%
#   setView(lng=rwanda$longitude, lat=rwanda$latitude, zoom=8) %>%
#   addPolygons(
#     stroke = TRUE, opacity=0.2, smoothFactor = 0.5, 
#     fillColor=mapLayer$pH, fillOpacity = 0.5)
pdf(file=paste(md, "rw_shs_baseline_soil_maps.pdf", sep = "/"), width=11, height=8.5)
for(i in 1:length(mapList)){
print(mapList[[i]])
}  
dev.off()
## quartz_off_screen 
##                 2

Interpolations of soil health values

Interpolate soil health values for full operating area using soil health study values. We want to eventually add all Rwandan soil values into a single dataset to update and hone these values. See here for more guidanace

note:

  • Layer on the Tubura sites to the interpolated map.
  • Use leaflet so you can zoom in and out more easily.
  • Make it a raster layer that you can click on understand the values at different locations and the name of the location. I’ll need to make this a shiny app to have that functionality

The code below will run 5 K-fold cross validation to compare interpolation models. The output will be fed into the interpolate leaflet code below.

Check that I’m handling the projection correctly with Robert

# proj4string(ss) <- CRS("+init=epsg:4326")
# ss <- spTransform(ss, CRS=("+proj=utm +zone=36N +datum=WGS84"))
# root mean sq error for evaluating models
RMSE <- function(observed, predicted) {
  sqrt(mean((predicted - observed)^2, na.rm=TRUE))
}

# set k folds to 5
set.seed(20161030)
nfolds <- 5
k <- kfold(ss, nfolds) # from dismo

# cross validation of models
ensrmse <- tpsrmse <- idwrmse <- rep(NA, 5) # assing multiple objects at once

cv <- function(x) {
  
  for(i in 1:nfolds) {
    train <- ss[k!=i,]
    test <- ss[k==i,]

  train <- train[!is.na(train@data[,x]),]
  
  # inverse distance weights
  m <- gstat(formula=as.formula(paste(x, '~ 1')), locations=train)
  p1 <- predict(m, newdata=test, debug.level=0)$var1.pred
  idwrmse[i] <-  RMSE(test@data[,x], p1) #idw rsme
  
  # krieging
  # m <- autoKrige(formula=as.formula(paste(x, "~ 1")), input_data = train)
  # p2 <- predict(m, newdata=test, debug.level=0)$var1.pred
  # krigrmse[i] <-  RMSE(test$OZDLYAV, p2)
  
  # thin plate spline
  m <- Tps(coordinates(train), train@data[,x]) 
  p3 <- predict(m, coordinates(test))
  tpsrmse[i] <-  RMSE(test@data[,x], p3)
  
  w <- c(idwrmse[i], tpsrmse[i]) # combine the rmse
  weights <- w / sum(w) # weight them
  ensemble <- p1 * weights[1] + p3 * weights[2] 
  # multiply predictions by weights
  ensrmse[i] <-  RMSE(test@data[,x], ensemble) # truly an ensemble result?
  }
  
  output <- rbind(idwrmse, tpsrmse, ensrmse)
  return(output)
  
}

Now loop over the variables of interest where x is the soilVar variable.

output <- lapply(soilVars, function(x){
  ini <- data.frame(cv(x))
  ini$ave <- apply(ini[,1:5], 1, function(y){mean(y, na.rm=T)})
  res <- paste("Best model is ", row.names(ini[which.min(ini$ave),]),  sep = "")
  return(list(ini, res))
})

Interpolated soil maps

r <- raster(res=1/12)
r <- crop(r, floor(extent(rw)))

maps <- lapply(soilVars, function(x){
  m <- Tps(coordinates(ss), ss@data[,x])
  # make raster layer with model, raster is rwanda empty raster, model is m
  tps <- interpolate(r, m)
  tps <- crop(tps, rw)
  tps <- mask(tps, rw) # cuts the tps raster down to the rw boundaries
  x <- gsub("m3.", "", x)
  
  palColors <- leaflet::colorNumeric(palette = "Reds", values(tps), na.color = "transparent")
  
  suppressWarnings(
  leaflet() %>% addTiles() %>%
  addRasterImage(tps, colors=palColors, opacity = 0.8) %>%
  setView(lng=rwanda$longitude, lat=rwanda$latitude, zoom=8) %>%
  addLegend(pal = palColors, values = values(tps), title = paste("Soil Value ", x, sep=""))
  )
})

save(maps,ss, file=paste(dd, "rw_baseline_interpolation_maps.Rdata", sep = "/"))

tagList(maps)

Print out the interpolated values for inclusion in the report.

## NULL
## NULL
## NULL
## NULL
## NULL
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## quartz_off_screen 
##                 2

Propensity Score Matching

We need to do a more rigorous job of accounting for differences between Tubura farmers and identified control farmers. Execute propensity score matching (PSM) to identify control farmers that overlap with Tubura farmers with regard to their likelihood of being a Tubura farmer.

cor(d[, grep("betail_", names(d))], use='complete.obs')
##                        betail_ownedn_inka betail_ownedn_ihene
## betail_ownedn_inka             1.00000000         0.036402152
## betail_ownedn_ihene            0.03640215         1.000000000
## betail_ownedn_inkoko           0.09740274         0.127424693
## betail_ownedn_ingurube         0.02166420        -0.009667001
## betail_ownedn_intama           0.04245052         0.002625236
##                        betail_ownedn_inkoko betail_ownedn_ingurube
## betail_ownedn_inka               0.09740274            0.021664199
## betail_ownedn_ihene              0.12742469           -0.009667001
## betail_ownedn_inkoko             1.00000000            0.049416370
## betail_ownedn_ingurube           0.04941637            1.000000000
## betail_ownedn_intama             0.03041599            0.025487680
##                        betail_ownedn_intama
## betail_ownedn_inka              0.042450521
## betail_ownedn_ihene             0.002625236
## betail_ownedn_inkoko            0.030415986
## betail_ownedn_ingurube          0.025487680
## betail_ownedn_intama            1.000000000
names(d)[grep("betail_", names(d))] <- c("cows", "goats", "chickens", "pigs", "sheep")
psmVars <- paste(c("female", "age", "hhsize", "total.seasons",
                   "cows", "goats", "chickens", "pigs", "sheep"),
                   collapse=" + ")

reg <- glm(as.formula(paste("client ~", psmVars, sep="")), 
           family= binomial(link="logit"), data=d)
summary(reg)    
## 
## Call:
## glm(formula = as.formula(paste("client ~", psmVars, sep = "")), 
##     family = binomial(link = "logit"), data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5522  -0.8456   0.0376   0.9250   2.0863  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.408940   0.203191   2.013   0.0442 *  
## female        -0.749686   0.097393  -7.698 1.39e-14 ***
## age           -0.024394   0.003378  -7.221 5.16e-13 ***
## hhsize         0.037275   0.022980   1.622   0.1048    
## total.seasons  0.530099   0.028803  18.404  < 2e-16 ***
## cows           0.007891   0.017934   0.440   0.6599    
## goats         -0.004426   0.030020  -0.147   0.8828    
## chickens       0.023011   0.017794   1.293   0.1959    
## pigs           0.052035   0.057838   0.900   0.3683    
## sheep          0.053211   0.070949   0.750   0.4533    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3386.7  on 2442  degrees of freedom
## Residual deviance: 2566.6  on 2433  degrees of freedom
## AIC: 2586.6
## 
## Number of Fisher Scoring iterations: 5
# summarize predicted probabilities
pr <- data.frame(pr_score = predict(reg, type='response'), treat = d$client)

# graph
psmGraph <- ggplot() + geom_histogram(data=subset(pr, pr$treat==1), aes(x = pr_score, y=..count.., fill=as.factor(treat)), bins=80, position = "identity") +
    geom_histogram(data=subset(pr, pr$treat==0), aes(x=pr_score, y=-..count.., fill=as.factor(treat)), bins=80, position = "identity") +
    scale_y_continuous(limits=c(-150,150)) + 
  labs(title ="PSM score overlap", x = "PSM score", y="Farmer count",
       fill="Tubura/Control")

print(psmGraph)

pdf(file=paste(od, "rw_baseline_psm_overlap.pdf", sep="/"), height=8.5, width=11)
print(psmGraph)
dev.off()
## quartz_off_screen 
##                 2

Interpretation We have some overlap but it’s clear that Tubura farmers occupy a different range than the identified control farmers. Let’s continue with the PSM matching process but restrict ourselves to Tubura and control farmers that meet a certain PSM matching radius.

Notes on PSM process:

We have to indicate a variable for matching. I’m choosing pH as we know it to be an issue in most of our operating areas and addressing soil acidity has numerous residual benefits to soil health. I’ll want to do this for all soil outcomes, however.

  • As a next step, I’d like to identify the matched subset and then regress Tubura tenure on soil outcomes. Does that make sense?
  • I chose a caliper of 0.25. I should review this process with Maya to make certain I’m following best practice. I sort of just pulled that sd figure out of the sky.
  • An initial check of the match quality using a common model indicates that the matches are poor.
# PSM prep
tr <- cbind(d$client)
x <- d[, unlist(strsplit(psmVars, " + ", fixed=T))]
y <- c("m3.Ca", "m3.Mg", "pH", "Total.N", "Total.C")


# PSM
set.seed(20161102)
m <- lapply(y, function(response){

  suppressWarnings(
  mod <- Match(Y = d[,response], Tr = tr, X = reg$fitted, ties=FALSE, replace=FALSE, caliper=0.25, estimand = "ATE")    
  )
  matchRes <- MatchBalance(tr ~ d[,response], match.out=mod, nboots=500, data=d, print.level = 0)
  return(list(mod, matchRes))
})
#lapply(m, summary)

Now check the naive model approach for PSM balance.

matchRes <- do.call(rbind, lapply(1:length(m), function(model){
  val <- as.data.frame(cbind(
    standard.diff=m[[model]][[2]]$AfterMatching[[1]]$sdiff, 
    var.ratio = m[[model]][[2]]$AfterMatching[[1]]$var.ratio,
    sdiff.adj = m[[model]][[2]]$AfterMatching[[1]]$sdiff/100))
  return(val)
}))
rownames(matchRes) <- y
print(kable(matchRes))
standard.diff var.ratio sdiff.adj
m3.Ca -1.678540 0.8498607 -0.0167854
m3.Mg -12.306011 0.7311426 -0.1230601
pH 6.233730 1.0749892 0.0623373
Total.N -1.873108 0.9551298 -0.0187311
Total.C -2.495694 1.0376312 -0.0249569

Interpretation: We want to see standard mean differences less than the absolute value of 0.25 (or 0.1 if we’re being conservative) and variance ratios close to 1 but certainly between 0.5 and 2.

According to the CRAN summary, sdiff is the standardized difference between the treatment and control units multiplied by 100. If I divide by 100, the values come much closer to reasonable value.

Let the PSM models vary

The common model approach doesn’t seem to be working for any of the variables. I’m going to rework the modeling approach so we can fit different models for each outcome upon which we’re trying to match.

d$age2 <- d$age^2
d$hhsize_age <- d$hhsize*d$age
d$hhsize2 <- d$hhsize^2

coreVars = c("female", "age", "hhsize", "own", "as.factor(cell)", "cows", "goats", "chickens", "pigs", "sheep")

psmList <- list(
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="m3.Ca"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="m3.Mg"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="pH"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="Total.N"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="Total.C"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="n_season_fert"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="n_season_compost"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="n_seasons_leg_1"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="n_season_fallow"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="logFert"),
  list(tr = "client",
       psmVars = paste(c(coreVars, "age2",
                   "hhsize2"),
                   collapse=" + "),
       y="n_seasons_leg_2"),
  list(tr = "client",
       psmVars = paste(c(coreVars, "age2",
                   "hhsize2"),
                   collapse=" + "),
       y="n_season_lime")
  
)

# list(tr = "client",
#        psmVars = paste(c("female", "age", "hhsize","as.factor(cell)",
#                    "cows", "goats", "chickens", "pigs", "sheep", "age2",
#                    "hhsize2"),
#                    collapse=" + "),
#        y="fert1.are"),
#   list(tr = "client",
#        psmVars = paste(c("female", "age", "hhsize","as.factor(cell)",
#                    "cows", "goats", "chickens", "pigs", "sheep", "age2",
#                    "hhsize2"),
#                    collapse=" + "),
#        y="fert2.are"),
#   list(tr = "client",
#        psmVars = paste(c("female", "age", "hhsize","as.factor(cell)",
#                    "cows", "goats", "chickens", "pigs", "sheep", "age2",
#                    "hhsize2"),
#                    collapse=" + "),
#        y="compost.are"),
#   list(tr = "client",
#        psmVars = paste(c("female", "age", "hhsize","as.factor(cell)",
#                    "cows", "goats", "chickens", "pigs", "sheep", "age2",
#                    "hhsize2"),
#                    collapse=" + "),
#        y="lime.are")


# wanted to include alt but we have missing values - resolve this.
# also skipping lime because of a small number of positive values

# PSM
set.seed(20161102)
m <- lapply(psmList, function(listInput){

  # keep complete cases of outcome variable
  k <- d[complete.cases(d[,listInput$y]),]
  
  # run glm regression:
  reg <- glm(as.formula(paste(listInput$tr, "~", listInput$psmVars, sep="")),  family= binomial(link="logit"), data=k)
  
  suppressWarnings(
  mod <- Match(Y = k[,listInput$y], Tr = k[,listInput$tr], X = reg$fitted, ties=FALSE, replace=FALSE, caliper=0.25, estimand = "ATE")   
  )
  matchRes <- MatchBalance(k[,listInput$tr] ~ k[,listInput$y], match.out=mod, nboots=500, data=k, print.level = 0)
  #print(listInput$y)
  return(list(mod, matchRes))
  
})

The models can now vary by outcome. Let’s see if we can improve our results.

matchRes <- do.call(rbind, lapply(1:length(m), function(model){
  val <- as.data.frame(cbind(
    standard.diff=m[[model]][[2]]$AfterMatching[[1]]$sdiff, 
    var.ratio = m[[model]][[2]]$AfterMatching[[1]]$var.ratio,
    sdiff.adj = m[[model]][[2]]$AfterMatching[[1]]$sdiff/100))
  return(val)
}))

namesInput <- NULL
for(i in 1:length(psmList)){
  namesInput[i] <- psmList[[i]]$y
}
rownames(matchRes) <- namesInput
print(kable(matchRes))
standard.diff var.ratio sdiff.adj
m3.Ca -4.809180 0.8986489 -0.0480918
m3.Mg -6.314048 0.9283896 -0.0631405
pH -3.357291 1.0112081 -0.0335729
Total.N -5.748301 0.9902182 -0.0574830
Total.C -2.528369 1.0534541 -0.0252837
n_season_fert 79.369281 3.3835173 0.7936928
n_season_compost 10.225241 0.9602097 0.1022524
n_seasons_leg_1 5.699451 0.8756917 0.0569945
n_season_fallow 6.008369 1.2701536 0.0600837
logFert 102.487739 1.7470221 1.0248774
n_seasons_leg_2 -13.214481 0.9274454 -0.1321448
n_season_lime 12.900263 1.4441547 0.1290026

Interpretation If I divide the standardized mean differences by 100, we meet the balance criteria of the standardized mean difference being close to 0 and the variance being close to 1. Let’s print out the model results to see how Tubura and control farmers compare on key soil meterics. These results should supercede the naive balance tables presented above.

We achieve acceptable balance for the soil attributes but we don’t for seasons of fertilizer use. This is isn’t entirely unexpected given that Tubura’s primary service is providing fertilizer inputs and training.

coefTable <- do.call(rbind, lapply(1:length(m), function(model){
  beta = round(m[[model]][[1]]$est.noadj,3)
  mean.Tr = round(m[[model]][[2]]$AfterMatching[[1]][[3]], 2)
  mean.Co = round(m[[model]][[2]]$AfterMatching[[1]][[4]], 2)
  pval = m[[model]][[2]]$AfterMatching[[1]][[10]][[3]] # p.value
  #pval = (1 - pnorm(abs(m[[model]][[1]]$est/m[[model]][[1]]$se.standard))) * 2
  pval = ifelse(pval < 0.001, "0.001", round(pval, 3))
  
  res = data.frame(beta, mean.Tr, mean.Co, pval)
  return(res)
}))
row.names(coefTable) <- namesInput
coefTable$pval.adj <- round(p.adjust(coefTable$pval, method="fdr"),3)
print(kable(coefTable))
beta mean.Tr mean.Co pval pval.adj
m3.Ca -25.577 854.22 879.80 0.151 0.181
m3.Mg -7.769 204.04 211.81 0.058 0.099
pH -0.019 5.52 5.54 0.289 0.315
Total.N -0.002 0.16 0.16 0.081 0.116
Total.C -0.013 2.10 2.12 0.432 0.432
n_season_fert 2.573 3.42 0.84 0.001 0.003
n_season_compost 0.384 5.98 5.60 0.002 0.005
n_seasons_leg_1 0.151 2.26 2.11 0.087 0.116
n_season_fallow 0.101 0.67 0.57 0.049 0.098
logFert 0.821 1.19 0.36 0.001 0.003
n_seasons_leg_2 -0.421 2.62 3.04 0.001 0.003
n_season_lime 0.087 0.22 0.14 0.001 0.003

Calculate farmers under soil health thresholds

thresh <- d %>% group_by(client) %>% summarize(
  count = n(),
  ph = sum(pH<5.8),
  carbon = sum(Total.C < 2),
  nitrogen = sum(Total.N < 0.1),
  calcium = sum(m3.Ca < 720),
  magnesium = sum(m3.Mg < 100)
) %>% mutate(
  under.ph = paste(paste(round(ph/count,4)*100, "%", sep=""), " (", ph, ")", sep=""),
  under.carbon = paste(paste(round(carbon/count,4)*100,"%", sep=""), " (", carbon, ")", sep=""),
  under.nitrogen = paste(paste(round(nitrogen/count,4)*100, "%", sep=""), " (", nitrogen, ")", sep=""),
  under.calcium = paste(paste(round(calcium/count,4)*100, "%", sep=""), " (", calcium, ")", sep=""),
  under.mag = paste(paste(round(magnesium/count,4)*100,"%", sep=""), " (", magnesium, ")", sep="")
) %>% as.data.frame() 

thresh <- thresh[, c("client", names(thresh)[grep("under", names(thresh))])]
thresh <- t(thresh)
colnames(thresh) = thresh[1, ] # the first row will be the header
colnames(thresh) = c("non-client", "client")
thresh = thresh[-1, ]

write.csv(thresh, file=paste(od, "table1_rw_thresholds.csv", sep = "/"), row.names = T)
write.csv(coefTable, file=paste(od, "psm coefficients.csv", sep = "/"),
          row.names = T)

# sort by the order Eric wants
coefTableES1 <- coefTable[c(3,5,4,1,2),]

write.csv(coefTableES1, file=paste(od, "psm coefficients ordered for ES.csv", sep = "/"))

# 11/17 added lime
coefTableES2 <- coefTable[c(6,7,12,9,8,11),]

write.csv(coefTableES2, file=paste(od, "psm coefficients ordered for ES_agprac.csv", sep = "/"))

Interpretation: Propensity score matching gives us a comparable treatment and control group. The table above shows that after matching on those characteristics, there are effectively no differences between One Acre Fund farmer and Tubura farmers on soil attributes. The unadjusted p-values show 1AF farmers to have slow levels of soil nitrogen but this finding disappears if we account for running multiple matching models.

When we expand the outcome variable set to include practice variables, we first no longer get a good propensity score match for all variables.

  • Tubura participation so strongly overlaps with fertilizer use that the comparison group is no longer comparable. The resulting model shows a significant difference in seasons of fertilizer use.
  • Tubura farmers have used compost for 0.3 more seasons than non-Tubura farmers. We don’t observe a difference in N or C levels in the soil attributes however.
  • We don’t see a difference between Tubura and non-Tubura farmers on seasons of legumes being the primary crop in the study plot.

Study group balance post-match

See here for some guidance on hwo to use weights to reconstruct the group balance following the matches.

See here for weighted t.test documentation

suppressMessages(library(weights))
tableVars <- c("age", "female", "hhsize", "own")

postMatch <- do.call(rbind, lapply(1:length(m), function(model){
  
  innerPost <- do.call(rbind, lapply(tableVars, function(x){
  
  mean.t = weighted.mean(d[m[[model]][[1]]$index.treated,][,x], m[[model]][[1]]$weights)
  mean.c = weighted.mean(d[m[[model]][[1]]$index.control,][,x], m[[model]][[1]]$weights)
  
  # combined data
  dm <- as.data.frame(rbind(d[m[[model]][[1]]$index.treated,], 
              d[m[[model]][[1]]$index.control,]))
  
  test = wtd.t.test(d[m[[model]][[1]]$index.treated,][,x], 
                    d[m[[model]][[1]]$index.control,][,x],
                    weight=m[[model]][[1]]$weights,
                    samedata=TRUE)
  
  return(data.frame(model.num = model, 
                    outcome=x, 
                    tr=mean.t, 
                    contr = mean.c,
                    pval = test$coefficients[3][[1]]))
  }))
  
  return(innerPost)
}))
write.csv(postMatch, file=paste(od, "rw post match covars.csv", sep = "/"),
          row.names = F)

Models with PSM matched farmers

Per Robert’s suggestion, now that we’ve matched Tubura and non-Tubura farmers, let’s assess the severity of Tubura tenure on key soil health outcomes.

tenureTab <- lapply(1:length(m), function(model){
  
  dm <- as.data.frame(rbind(d[m[[model]][[1]]$index.treated,], 
              d[m[[model]][[1]]$index.control,]))
  dm$client_tenure <- dm$client*dm$total.seasons
  mod <- lm(as.formula(paste("dm[,psmList[[model]]$y] ~", "total.seasons + as.factor(cell)", sep ="")), data=dm)
  return(mod)
})
modNames <- c("Calcium", "Magnesium", "pH", "Nitrogen", "Carbon", "Seasons fertilizer", "Seasons compost", "Seasons legumes", "Seasons fallow", "Fertilizer (log)", "Season Sec. legumes")

suppressWarnings(
stargazer(tenureTab, type="html", 
          title = "2016A Rwanda Soil Health Baseline - PSM Tenure",
          covariate.labels = "One Acre Fund Tenure", 
          dep.var.labels = "",
          column.labels = modNames,
          #column.labels = c(gsub("m3.", "", as.vector(namesInput))),
          notes = "Includes FE for cell",
          omit=c("cell"), out=paste(od, "rw_baseline_matched_tenure.htm", sep="/"))
)
2016A Rwanda Soil Health Baseline - PSM Tenure
Dependent variable:
Calcium Magnesium pH Nitrogen Carbon Seasons fertilizer Seasons compost Seasons legumes Seasons fallow Fertilizer (log) Season Sec. legumes
(1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (12)
One Acre Fund Tenure 0.406 -0.235 0.004* -0.0002 -0.001 0.539*** 0.154*** 0.058*** 0.010 0.153*** -0.053*** 0.016***
(2.149) (0.469) (0.002) (0.0001) (0.002) (0.012) (0.018) (0.013) (0.009) (0.004) (0.016) (0.003)
Constant 1,732.567*** 329.786*** 5.970*** 0.190*** 2.436*** 1.919*** 4.789*** 0.878** 0.899*** 0.953*** 2.105 0.752***
(78.745) (17.231) (0.076) (0.004) (0.082) (0.433) (0.633) (0.439) (0.312) (0.129) (1.897) (0.122)
Observations 3,824 3,824 3,824 3,824 3,824 3,824 3,824 3,824 3,824 3,824 3,771 3,800
R2 0.568 0.607 0.628 0.511 0.477 0.496 0.371 0.416 0.171 0.483 0.352 0.203
Adjusted R2 0.544 0.585 0.608 0.484 0.449 0.469 0.336 0.384 0.125 0.455 0.315 0.159
Residual Std. Error 369.039 (df = 3623) 80.763 (df = 3623) 0.354 (df = 3623) 0.021 (df = 3623) 0.383 (df = 3623) 2.120 (df = 3623) 3.098 (df = 3623) 2.147 (df = 3623) 1.494 (df = 3623) 0.606 (df = 3623) 2.683 (df = 3569) 0.573 (df = 3599)
F Statistic 23.836*** (df = 200; 3623) 27.996*** (df = 200; 3623) 30.609*** (df = 200; 3623) 18.923*** (df = 200; 3623) 16.552*** (df = 200; 3623) 17.861*** (df = 200; 3623) 10.691*** (df = 200; 3623) 12.922*** (df = 200; 3623) 3.732*** (df = 200; 3623) 16.957*** (df = 200; 3623) 9.634*** (df = 201; 3569) 4.598*** (df = 200; 3599)
Note: p<0.1; p<0.05; p<0.01
Includes FE for cell

Interpretation: Using a PSM matched sample, the models above assess the effects of additional years of farming with Tubura. Numerous control farmers have also been Tubura farmers in previous seasons. Thus, I’m keeping the model simple instead of adding a client*tenure interaction. We can easily test that as well though.

–end